From 42c6e6e1c40b2d2e21e74d1bdf3a2bcf4a5e7fa1 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Tue, 8 Aug 2017 14:09:13 -0700 Subject: [PATCH 01/23] flat param API impl --- python/sparkdl/graph/utils.py | 52 +++-- python/sparkdl/transformers/param.py | 124 ++++++++-- python/sparkdl/transformers/tf_tensor.py | 94 ++++++++ python/tests/transformers/tf_tensor_test.py | 236 ++++++++++++++++++++ 4 files changed, 475 insertions(+), 31 deletions(-) create mode 100644 python/sparkdl/transformers/tf_tensor.py create mode 100644 python/tests/transformers/tf_tensor_test.py diff --git a/python/sparkdl/graph/utils.py b/python/sparkdl/graph/utils.py index 45d8b065..75dec230 100644 --- a/python/sparkdl/graph/utils.py +++ b/python/sparkdl/graph/utils.py @@ -95,31 +95,49 @@ def get_tensor(graph, tfobj_or_name): 'cannot locate tensor {} in current graph'.format(_tensor_name) return tnsr -def as_tensor_name(name): +def as_tensor_name(tfobj_or_name): """ Derive tf.Tensor name from an op/tensor name. - We do not check if the tensor exist (as no graph parameter is passed in). + If the input is a name, we do not check if the tensor exist + (as no graph parameter is passed in). - :param name: op name or tensor name + :param tfobj_or_name: either a tf.Tensor, tf.Operation or a name to either """ - assert isinstance(name, six.string_types) - name_parts = name.split(":") - assert len(name_parts) <= 2, name_parts - if len(name_parts) < 2: - name += ":0" - return name + if isinstance(tfobj_or_name, six.string_types): + # If input is a string, assume it is a name and infer the corresponding tensor name. + # WARNING: this depends on TensorFlow's tensor naming convention + name = tfobj_or_name + name_parts = name.split(":") + assert len(name_parts) <= 2, name_parts + if len(name_parts) < 2: + name += ":0" + return name + elif hasattr(tfobj_or_name, 'graph'): + tfobj = tfobj_or_name + return get_tensor(tfobj.graph, tfobj).name + else: + raise TypeError('invalid tf.Tensor name query type {}'.format(type(tfobj_or_name))) -def as_op_name(name): +def as_op_name(tfobj_or_name): """ - Derive tf.Operation name from an op/tensor name - We do not check if the operation exist (as no graph parameter is passed in). + Derive tf.Operation name from an op/tensor name. + If the input is a name, we do not check if the operation exist + (as no graph parameter is passed in). - :param name: op name or tensor name + :param tfobj_or_name: either a tf.Tensor, tf.Operation or a name to either """ - assert isinstance(name, six.string_types) - name_parts = name.split(":") - assert len(name_parts) <= 2, name_parts - return name_parts[0] + if isinstance(tfobj_or_name, six.string_types): + # If input is a string, assume it is a name and infer the corresponding operation name. + # WARNING: this depends on TensorFlow's operation naming convention + name = tfobj_or_name + name_parts = name.split(":") + assert len(name_parts) <= 2, name_parts + return name_parts[0] + elif hasattr(tfobj_or_name, 'graph'): + tfobj = tfobj_or_name + return get_op(tfobj.graph, tfobj).name + else: + raise TypeError('invalid tf.Operation name query type {}'.format(type(tfobj_or_name))) def op_name(graph, tfobj_or_name): """ diff --git a/python/sparkdl/transformers/param.py b/python/sparkdl/transformers/param.py index f3d3cbaf..eb3d4188 100644 --- a/python/sparkdl/transformers/param.py +++ b/python/sparkdl/transformers/param.py @@ -20,14 +20,19 @@ """ from functools import wraps +import six import keras import tensorflow as tf from pyspark.ml.param import Param, Params, TypeConverters +from sparkdl.graph.builder import GraphFunction, IsolatedSession +import sparkdl.graph.utils as tfx -# From pyspark +""" +Copied from PySpark for backward compatibility. First in Apache Spark version 2.1.1. +""" def keyword_only(func): """ @@ -50,7 +55,8 @@ class HasInputCol(Params): Mixin for param inputCol: input column name. """ - inputCol = Param(Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString) + inputCol = Param(Params._dummy(), "inputCol", "input column name.", + typeConverter=TypeConverters.toString) def __init__(self): super(HasInputCol, self).__init__() @@ -73,7 +79,8 @@ class HasOutputCol(Params): Mixin for param outputCol: output column name. """ - outputCol = Param(Params._dummy(), "outputCol", "output column name.", typeConverter=TypeConverters.toString) + outputCol = Param(Params._dummy(), "outputCol", "output column name.", + typeConverter=TypeConverters.toString) def __init__(self): super(HasOutputCol, self).__init__() @@ -92,10 +99,44 @@ def getOutputCol(self): return self.getOrDefault(self.outputCol) -# New in sparkdl - +""" +TensorFlow Specific Parameters +""" class SparkDLTypeConverters(object): + @staticmethod + def toTFGraph(value): + if isinstance(value, tf.Graph): + return value + elif isinstance(value, GraphFunction): + with IsolatedSession() as issn: + issn.importGraphFunction(value, prefix='') + g = issn.graph + return g + else: + raise TypeError("Could not convert %s to TensorFlow Graph" % type(value)) + + @staticmethod + def asColumnToTensorMap(value): + if isinstance(value, dict): + strs_pair_seq = [(k, tfx.as_tensor_name(v)) for k, v in value.items()] + return sorted(strs_pair_seq) + raise TypeError("Could not convert %s to TensorFlow Tensor" % type(value)) + + @staticmethod + def asTensorToColumnMap(value): + if isinstance(value, dict): + strs_pair_seq = [(tfx.as_tensor_name(k), v) for k, v in value.items()] + return sorted(strs_pair_seq) + raise TypeError("Could not convert %s to TensorFlow Tensor" % type(value)) + + @staticmethod + def toTFHParams(value): + if isinstance(value, tf.contrib.training.HParams): + return value + else: + raise TypeError("Could not convert %s to TensorFlow HParams" % type(value)) + @staticmethod def toStringOrTFTensor(value): if isinstance(value, tf.Tensor): @@ -106,15 +147,6 @@ def toStringOrTFTensor(value): except TypeError: raise TypeError("Could not convert %s to tensorflow.Tensor or str" % type(value)) - @staticmethod - def toTFGraph(value): - # TODO: we may want to support tf.GraphDef in the future instead of tf.Graph since user - # is less likely to mess up using GraphDef vs Graph (e.g. constants vs variables). - if isinstance(value, tf.Graph): - return value - else: - raise TypeError("Could not convert %s to tensorflow.Graph type" % type(value)) - @staticmethod def supportedNameConverter(supportedList): def converter(value): @@ -122,3 +154,67 @@ def converter(value): return value else: raise TypeError("%s %s is not in the supported list." % type(value), str(value)) + return converter + + +class HasTFHParams(Params): + """ + Mixin for TensorFlow params + """ + hparam = Param(Params._dummy(), "hparams", "instance of :class:`tf.contrib.training.HParams`", + typeConverter=SparkDLTypeConverters.toTFHParams) + +# New in sparkdl + +class HasOutputMapping(Params): + """ + Mixin for param outputMapping: ordered list of ('outputTensorName', 'outputColName') pairs + """ + outputMapping = Param(Params._dummy(), "outputMapping", + "Mapping output :class:`tf.Tensor` objects to DataFrame column names", + typeConverter=SparkDLTypeConverters.asTensorToColumnMap) + + def __init__(self): + super(HasOutputMapping, self).__init__() + + def setOutputMapping(self, value): + return self._set(outputMapping=value) + + def getOutputMapping(self): + return self.getOrDefault(self.outputMapping) + + +class HasInputMapping(Params): + """ + Mixin for param inputMapping: ordered list of ('inputColName', 'inputTensorName') pairs + """ + inputMapping = Param(Params._dummy(), "inputMapping", + "Mapping input DataFrame column names to :class:`tf.Tensor` objects", + typeConverter=SparkDLTypeConverters.asColumnToTensorMap) + + def __init__(self): + super(HasInputMapping, self).__init__() + + def setInputMapping(self, value): + return self._set(inputMapping=value) + + def getInputMapping(self): + return self.getOrDefault(self.inputMapping) + + +class HasTFGraph(Params): + """ + Mixin for param tfGraph: the :class:`tf.Graph` object that represents a TensorFlow computation. + """ + tfGraph = Param(Params._dummy(), "tfGraph", + "TensorFlow Graph object", + typeConverter=SparkDLTypeConverters.toTFGraph) + + def __init__(self): + super(HasTFGraph, self).__init__() + + def setTFGraph(self, value): + return self._set(tfGraph=value) + + def getTFGraph(self): + return self.getOrDefault(self.tfGraph) diff --git a/python/sparkdl/transformers/tf_tensor.py b/python/sparkdl/transformers/tf_tensor.py new file mode 100644 index 00000000..d2053c3d --- /dev/null +++ b/python/sparkdl/transformers/tf_tensor.py @@ -0,0 +1,94 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import absolute_import, division, print_function + +import logging +import numpy as np +import tensorflow as tf +import tensorframes as tfs + +from pyspark.ml import Transformer +from pyspark.ml.param import Param, Params +from pyspark.sql.functions import udf + +from sparkdl.graph.builder import IsolatedSession +import sparkdl.graph.utils as tfx +from sparkdl.transformers.param import ( + keyword_only, HasInputMapping, HasOutputMapping, SparkDLTypeConverters, + HasTFGraph, HasTFHParams) + +__all__ = ['TFTransformer'] + +logger = logging.getLogger('sparkdl') + +class TFTransformer(Transformer, HasTFGraph, HasTFHParams, HasInputMapping, HasOutputMapping): + """ + Applies the TensorFlow graph to the array column in DataFrame. + + Restrictions of the current API: + + We assume that + - All graphs have a "minibatch" dimension (i.e. an unknown leading + dimension) in the tensor shapes. + - Input DataFrame has an array column where all elements have the same length + """ + + @keyword_only + def __init__(self, inputMapping=None, outputMapping=None, tfGraph=None, hparams=None): + """ + __init__(self, inputMapping=None, outputMapping=None, tfGraph=None, hparams=None) + """ + super(TFTransformer, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputMapping=None, outputMapping=None, tfGraph=None, hparams=None): + """ + setParams(self, inputMapping=None, outputMapping=None, tfGraph=None, hparams=None) + """ + super(TFTransformer, self).__init__() + kwargs = self._input_kwargs + return self._set(**kwargs) + + def _transform(self, dataset): + df = dataset + output_renaming = {} + + with IsolatedSession(graph=self.getTFGraph()) as issn: + feeds = [] + for input_colname, tnsr in self.getInputMapping(): + feeds.append(tfx.get_tensor(issn.graph, tnsr)) + tf_expected_colname = tfx.op_name(issn.graph, tnsr) + df = df.withColumnRenamed(input_colname, tf_expected_colname) + + fetches = [] + for tnsr, output_colname in self.getOutputMapping(): + fetches.append(tfx.get_tensor(issn.graph, tnsr)) + tf_expected_colname = tfx.op_name(issn.graph, tnsr) + output_renaming[tf_expected_colname] = output_colname + + gfn = issn.asGraphFunction(feeds, fetches, strip_and_freeze=True) + + analyzed_df = tfs.analyze(df) + + with IsolatedSession() as issn: + _, fetches = issn.importGraphFunction(gfn, prefix='') + out_df = tfs.map_blocks(fetches, analyzed_df) + + for old_colname, new_colname in output_renaming.items(): + out_df = out_df.withColumnRenamed(old_colname, new_colname) + + return out_df diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py new file mode 100644 index 00000000..495ff8e8 --- /dev/null +++ b/python/tests/transformers/tf_tensor_test.py @@ -0,0 +1,236 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os +import shutil +import tempfile + +from keras.layers import Conv1D, Dense, Flatten, MaxPool1D +import numpy as np +import tensorflow as tf +import tensorframes as tfs + +from pyspark.sql.types import Row + +from sparkdl.graph.builder import IsolatedSession +import sparkdl.graph.utils as tfx +from sparkdl.transformers.tf_tensor import TFTransformer + +from ..tests import SparkDLTestCase + +def grab_df_arr(df, output_col): + """ Stack the numpy array from a DataFrame column """ + return np.array([row.asDict()[output_col] + for row in df.select(output_col).toLocalIterator()]) + +class TFTransformerTest(SparkDLTestCase): + + def _get_rand_vec_df(self, num_rows, vec_size): + return self.session.createDataFrame( + Row(idx=idx, vec=np.random.randn(vec_size).tolist()) + for idx in range(num_rows)) + + def test_checkpoint_reload(self): + vec_size = 17 + num_vecs = 31 + df = self._get_rand_vec_df(num_vecs, vec_size) + analyzed_df = tfs.analyze(df) + input_col = 'vec' + output_col = 'outputCol' + + # Build the TensorFlow graph + model_temp_dir = tempfile.mkdtemp() + ckpt_dir = os.path.join(model_temp_dir, 'model_ckpt') + with tf.Session() as sess: + x = tf.placeholder(tf.float64, shape=[None, vec_size], name='tnsrIn') + w = tf.Variable(tf.random_normal([vec_size], dtype=tf.float64), + dtype=tf.float64, name='varW') + z = tf.reduce_mean(x * w, axis=1, name='tnsrOut') + sess.run(w.initializer) + saver = tf.train.Saver(var_list=[w]) + saved_path = saver.save(sess, ckpt_dir, global_step=2702) + + # Get the reference data + _results = [] + for row in df.rdd.toLocalIterator(): + arr = np.array(row.vec)[np.newaxis, :] + _results.append(sess.run(z, {x: arr})) + out_ref = np.hstack(_results) + + # Load the saved model checkpoint + # We want to clear device assignment in order to run it anywhere we want + with IsolatedSession() as issn: + saver = tf.train.import_meta_graph('{}.meta'.format(saved_path), clear_devices=True) + saver.restore(issn.sess, saved_path) + gfn = issn.asGraphFunction( + [tfx.get_tensor(issn.graph, 'tnsrIn')], + [tfx.get_tensor(issn.graph, 'tnsrOut')]) + + transformer = TFTransformer(tfGraph=gfn, + inputMapping={ + input_col: 'tnsrIn' + }, + outputMapping={ + 'tnsrOut': output_col + }) + final_df = transformer.transform(analyzed_df) + out_tgt = grab_df_arr(final_df, output_col) + + shutil.rmtree(model_temp_dir, ignore_errors=True) + self.assertTrue(np.allclose(out_ref, out_tgt)) + + def test_simple(self): + # Build a simple input DataFrame + vec_size = 17 + num_vecs = 31 + df = self._get_rand_vec_df(num_vecs, vec_size) + analyzed_df = tfs.analyze(df) + + # Build the TensorFlow graph + with tf.Session() as sess: + #x = tf.placeholder(tf.float64, shape=[None, vec_size]) + x = tfs.block(analyzed_df, 'vec') + z = tf.reduce_mean(x, axis=1) + graph = sess.graph + + # Get the reference data + _results = [] + for row in df.rdd.toLocalIterator(): + arr = np.array(row.vec)[np.newaxis, :] + _results.append(sess.run(z, {x: arr})) + out_ref = np.hstack(_results) + + # Apply the transform + transfomer = TFTransformer(tfGraph=graph, + inputMapping={ + 'vec': x + }, + outputMapping={ + z: 'outCol' + }) + final_df = transfomer.transform(analyzed_df) + out_tgt = grab_df_arr(final_df, 'outCol') + + self.assertTrue(np.allclose(out_ref, out_tgt)) + + + def test_multi_io(self): + # Build a simple input DataFrame + vec_size = 17 + num_vecs = 31 + _df = self._get_rand_vec_df(num_vecs, vec_size) + df_x = _df.withColumnRenamed('vec', 'vec_x') + _df = self._get_rand_vec_df(num_vecs, vec_size) + df_y = _df.withColumnRenamed('vec', 'vec_y') + df = df_x.join(df_y, on='idx', how='inner') + analyzed_df = tfs.analyze(df) + + # Build the TensorFlow graph + with tf.Session() as sess: + x = tfs.block(analyzed_df, 'vec_x') + y = tfs.block(analyzed_df, 'vec_y') + p = tf.reduce_mean(x + y, axis=1) + q = tf.reduce_mean(x - y, axis=1) + graph = sess.graph + + # Get the reference data + p_out_ref = [] + q_out_ref = [] + for row in df.rdd.toLocalIterator(): + arr_x = np.array(row['vec_x'])[np.newaxis, :] + arr_y = np.array(row['vec_y'])[np.newaxis, :] + p_val, q_val = sess.run([p, q], {x: arr_x, y: arr_y}) + p_out_ref.append(p_val) + q_out_ref.append(q_val) + p_out_ref = np.hstack(p_out_ref) + q_out_ref = np.hstack(q_out_ref) + + # Apply the transform + transfomer = TFTransformer(tfGraph=graph, + inputMapping={ + 'vec_x': x, + 'vec_y': y + }, + outputMapping={ + p: 'outcol_p', + q: 'outcol_q' + }) + final_df = transfomer.transform(analyzed_df) + p_out_tgt = grab_df_arr(final_df, 'outcol_p') + q_out_tgt = grab_df_arr(final_df, 'outcol_q') + + self.assertTrue(np.allclose(p_out_ref, p_out_tgt)) + self.assertTrue(np.allclose(q_out_ref, q_out_tgt)) + + def test_map_blocks_graph(self): + + vec_size = 17 + num_vecs = 137 + df = self._get_rand_vec_df(num_vecs, vec_size) + analyzed_df = tfs.analyze(df) + + input_col = 'vec' + output_col = 'outCol' + + # Build the graph: the output should have the same leading/batch dimension + with IsolatedSession(using_keras=True) as issn: + tnsr_in = tfs.block(analyzed_df, input_col) + inp = tf.expand_dims(tnsr_in, axis=2) + # Keras layers does not take tf.double + inp = tf.cast(inp, tf.float32) + conv = Conv1D(filters=4, kernel_size=2)(inp) + pool = MaxPool1D(pool_size=2)(conv) + flat = Flatten()(pool) + dense = Dense(1)(flat) + # We must keep the leading dimension of the output + redsum = tf.reduce_sum(dense, axis=1) + tnsr_out = tf.cast(redsum, tf.double, name='TnsrOut') + + # Initialize the variables + init_op = tf.global_variables_initializer() + issn.run(init_op) + # We could train the model ... but skip it here + gfn = issn.asGraphFunction([tnsr_in], [tnsr_out]) + + with IsolatedSession() as issn: + # Import the graph function object + feeds, fetches = issn.importGraphFunction(gfn, prefix='') + + # Rename the input column name to the feed op's name + orig_in_name = tfx.op_name(issn.graph, feeds[0]) + input_df = analyzed_df.withColumnRenamed(input_col, orig_in_name) + + # Do the actual computation + output_df = tfs.map_blocks(fetches, input_df) + + # Rename the output column (by default, the name of the fetch op's name) + orig_out_name = tfx.op_name(issn.graph, fetches[0]) + final_df = output_df.withColumnRenamed(orig_out_name, output_col) + + arr_ref = grab_df_arr(final_df, output_col) + + # Using the Transformer + transformer = TFTransformer(tfGraph=gfn, + inputMapping={ + input_col: gfn.input_names[0] + }, + outputMapping={ + gfn.output_names[0]: output_col + }) + transformed_df = transformer.transform(analyzed_df) + + arr_tgt = grab_df_arr(transformed_df, output_col) + + self.assertTrue(np.allclose(arr_ref, arr_tgt)) + From ecbefb948c8cbe8d66183493833732908e61d398 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 25 Aug 2017 00:44:16 -0700 Subject: [PATCH 02/23] support input graph scenarios --- python/sparkdl/transformers/param.py | 79 ++++++++- python/sparkdl/transformers/tf_tensor.py | 108 +++++++++--- python/tests/transformers/tf_tensor_test.py | 183 ++++++++++++++------ 3 files changed, 293 insertions(+), 77 deletions(-) diff --git a/python/sparkdl/transformers/param.py b/python/sparkdl/transformers/param.py index eb3d4188..e11789d0 100644 --- a/python/sparkdl/transformers/param.py +++ b/python/sparkdl/transformers/param.py @@ -119,14 +119,14 @@ def toTFGraph(value): @staticmethod def asColumnToTensorMap(value): if isinstance(value, dict): - strs_pair_seq = [(k, tfx.as_tensor_name(v)) for k, v in value.items()] + strs_pair_seq = [(k, tfx.as_op_name(v)) for k, v in value.items()] return sorted(strs_pair_seq) raise TypeError("Could not convert %s to TensorFlow Tensor" % type(value)) @staticmethod def asTensorToColumnMap(value): if isinstance(value, dict): - strs_pair_seq = [(tfx.as_tensor_name(k), v) for k, v in value.items()] + strs_pair_seq = [(tfx.as_op_name(k), v) for k, v in value.items()] return sorted(strs_pair_seq) raise TypeError("Could not convert %s to TensorFlow Tensor" % type(value)) @@ -161,8 +161,8 @@ class HasTFHParams(Params): """ Mixin for TensorFlow params """ - hparam = Param(Params._dummy(), "hparams", "instance of :class:`tf.contrib.training.HParams`", - typeConverter=SparkDLTypeConverters.toTFHParams) + tfHParms = Param(Params._dummy(), "hparams", "instance of :class:`tf.contrib.training.HParams`", + typeConverter=SparkDLTypeConverters.toTFHParams) # New in sparkdl @@ -202,6 +202,76 @@ def getInputMapping(self): return self.getOrDefault(self.inputMapping) +class HasTagSet(Params): + # TODO: add docs + tagSet = Param(Params._dummy(), "tagSet", + "signature def tag set", + typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasTagSet, self).__init__() + # TODO: add default value + + def setTagSet(self, value): + return self._set(tagSet=value) + + def getTagSet(self): + return self.getOrDefault(self.tagSet) + + +class HasSignatureDefKey(Params): + # TODO: add docs + signatureDefKey = Param(Params._dummy(), "signatureDefKey", + "signature def", + typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasSignatureDefKey, self).__init__() + # TODO: add default value + + def setSignatureDefKey(self, value): + return self._set(signatureDefKey=value) + + def getSignatureDefKey(self): + return self.getOrDefault(self.signatureDefKey) + + +class HasExportDir(Params): + """ + Mixin for param for constructing inputs + """ + exportDir = Param(Params._dummy(), "exportDir", + "Directory of saved model", + typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasExportDir, self).__init__() + + def setExportDir(self, value): + return self._set(exportDir=value) + + def getExportDir(self): + return self.getOrDefault(self.exportDir) + + +class HasTFCheckpointDir(Params): + """ + Mixin for TensorFlow model checkpoint + """ + tfCheckpointDir = Param(Params._dummy(), "tfCheckpointDir", + "Directory that contains a model checkpoint", + typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasTFCheckpointDir, self).__init__() + + def setTFCheckpointDir(self, value): + return self._set(tfCheckpointDir=value) + + def getTFCheckpointDir(self): + return self.getOrDefault(self.tfCheckpointDir) + + class HasTFGraph(Params): """ Mixin for param tfGraph: the :class:`tf.Graph` object that represents a TensorFlow computation. @@ -212,6 +282,7 @@ class HasTFGraph(Params): def __init__(self): super(HasTFGraph, self).__init__() + self._setDefault(tfGraph=None) def setTFGraph(self, value): return self._set(tfGraph=value) diff --git a/python/sparkdl/transformers/tf_tensor.py b/python/sparkdl/transformers/tf_tensor.py index d2053c3d..24ae4365 100644 --- a/python/sparkdl/transformers/tf_tensor.py +++ b/python/sparkdl/transformers/tf_tensor.py @@ -27,13 +27,15 @@ import sparkdl.graph.utils as tfx from sparkdl.transformers.param import ( keyword_only, HasInputMapping, HasOutputMapping, SparkDLTypeConverters, - HasTFGraph, HasTFHParams) + HasTFGraph, HasTFHParams, HasTFCheckpointDir, HasExportDir, HasTagSet, HasSignatureDefKey) __all__ = ['TFTransformer'] logger = logging.getLogger('sparkdl') -class TFTransformer(Transformer, HasTFGraph, HasTFHParams, HasInputMapping, HasOutputMapping): +class TFTransformer(Transformer, HasTFCheckpointDir, HasTFGraph, + HasExportDir, HasTagSet, HasSignatureDefKey, + HasTFHParams, HasInputMapping, HasOutputMapping): """ Applies the TensorFlow graph to the array column in DataFrame. @@ -46,49 +48,111 @@ class TFTransformer(Transformer, HasTFGraph, HasTFHParams, HasInputMapping, HasO """ @keyword_only - def __init__(self, inputMapping=None, outputMapping=None, tfGraph=None, hparams=None): + def __init__(self, tfCheckpointDir=None, tfGraph=None, + exportDir=None, tagSet=None, signatureDefKey=None, + inputMapping=None, outputMapping=None, tfHParms=None): """ - __init__(self, inputMapping=None, outputMapping=None, tfGraph=None, hparams=None) + __init__(self, tfCheckpointDir=None, tfGraph=None, + exportDir=None, tagSet=None, signatureDefKey=None, + inputMapping=None, outputMapping=None, tfHParms=None) """ super(TFTransformer, self).__init__() kwargs = self._input_kwargs self.setParams(**kwargs) + @keyword_only - def setParams(self, inputMapping=None, outputMapping=None, tfGraph=None, hparams=None): + def setParams(self, tfCheckpointDir=None, tfGraph=None, + exportDir=None, tagSet=None, signatureDefKey=None, + inputMapping=None, outputMapping=None, tfHParms=None): """ - setParams(self, inputMapping=None, outputMapping=None, tfGraph=None, hparams=None) + setParams(self, tfCheckpointDir=None, tfGraph=None, + exportDir=None, tagSet=None, signatureDefKey=None, + inputMapping=None, outputMapping=None, tfHParms=None) """ super(TFTransformer, self).__init__() kwargs = self._input_kwargs return self._set(**kwargs) - def _transform(self, dataset): - df = dataset - output_renaming = {} - with IsolatedSession(graph=self.getTFGraph()) as issn: + def _convertInternal(self): + assert self.isDefined(self.inputMapping) and self.isDefined(self.outputMapping), \ + "inputMapping and outputMapping must be defined" + + _maybe_graph = self.getTFGraph() + _maybe_meta_graph_def = None + with IsolatedSession(graph=_maybe_graph) as issn: + if self.isDefined(self.exportDir): + assert _maybe_graph is None + assert not self.isDefined(self.tfCheckpointDir) + tag_set = self.getTagSet().split(',') + _maybe_meta_graph_def = tf.saved_model.loader.load( + issn.sess, tag_set, self.getExportDir()) + elif self.isDefined(self.tfCheckpointDir): + assert _maybe_graph is None + ckpt_dir = self.getTFCheckpointDir() + ckpt_path = tf.train.latest_checkpoint(ckpt_dir) + print('using checkpoint path from {} as {}'.format(ckpt_dir, ckpt_path)) + saver = tf.train.import_meta_graph("{}.meta".format(ckpt_path), clear_devices=True) + saver.restore(issn.sess, ckpt_path) + _maybe_meta_graph_def = saver.export_meta_graph(clear_devices=True) + else: + assert _maybe_graph is not None + + sig_def = None + if self.isDefined(self.signatureDefKey): + sig_def_key = self.getSignatureDefKey() + if sig_def_key is not None: + meta_graph_def = _maybe_meta_graph_def + assert meta_graph_def is not None + #print('sigdef:', meta_graph_def.signature_def) + sig_def = tf.contrib.saved_model.get_signature_def_by_key( + meta_graph_def, sig_def_key) + assert sig_def is not None + feeds = [] - for input_colname, tnsr in self.getInputMapping(): + _input_mapping = {} + for input_colname, tnsr_or_sig in self.getInputMapping(): + if sig_def: + tnsr = sig_def.inputs[tnsr_or_sig].name + _input_mapping[input_colname] = tfx.op_name(issn.graph, tnsr) + else: + tnsr = tnsr_or_sig feeds.append(tfx.get_tensor(issn.graph, tnsr)) - tf_expected_colname = tfx.op_name(issn.graph, tnsr) - df = df.withColumnRenamed(input_colname, tf_expected_colname) + + if sig_def: + self.setInputMapping(_input_mapping) fetches = [] - for tnsr, output_colname in self.getOutputMapping(): + # By default the output columns will have the name of their + # corresponding `tf.Graph` operation names. + # We have to convert them to the user specified output names + self.output_renaming = {} + for tnsr_or_sig, output_colname in self.getOutputMapping(): + if sig_def: + tnsr = sig_def.outputs[tnsr_or_sig].name + else: + tnsr = tnsr_or_sig fetches.append(tfx.get_tensor(issn.graph, tnsr)) tf_expected_colname = tfx.op_name(issn.graph, tnsr) - output_renaming[tf_expected_colname] = output_colname + self.output_renaming[tf_expected_colname] = output_colname - gfn = issn.asGraphFunction(feeds, fetches, strip_and_freeze=True) + # Consolidate the input format into a serialized format + self.gfn = issn.asGraphFunction(feeds, fetches, strip_and_freeze=True) - analyzed_df = tfs.analyze(df) - with IsolatedSession() as issn: - _, fetches = issn.importGraphFunction(gfn, prefix='') - out_df = tfs.map_blocks(fetches, analyzed_df) + def _transform(self, dataset): + self._convertInternal() - for old_colname, new_colname in output_renaming.items(): - out_df = out_df.withColumnRenamed(old_colname, new_colname) + with IsolatedSession() as issn: + analyzed_df = tfs.analyze(dataset) + _, fetches = issn.importGraphFunction(self.gfn, prefix='') + feed_dict = dict([(tnsr_name, col_name) for col_name, tnsr_name in self.getInputMapping()]) + out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict) + + # We still have to rename output columns + for old_colname, new_colname in self.output_renaming.items(): + if old_colname != new_colname: + out_df = out_df.withColumnRenamed(old_colname, new_colname) return out_df diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index 495ff8e8..296709c8 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from __future__ import absolute_import, division, print_function + import os import shutil import tempfile @@ -32,7 +34,7 @@ def grab_df_arr(df, output_col): """ Stack the numpy array from a DataFrame column """ return np.array([row.asDict()[output_col] - for row in df.select(output_col).toLocalIterator()]) + for row in df.select(output_col).collect()]) class TFTransformerTest(SparkDLTestCase): @@ -41,87 +43,167 @@ def _get_rand_vec_df(self, num_rows, vec_size): Row(idx=idx, vec=np.random.randn(vec_size).tolist()) for idx in range(num_rows)) - def test_checkpoint_reload(self): + def test_build_from_tf_graph(self): + # Build a simple input DataFrame vec_size = 17 num_vecs = 31 df = self._get_rand_vec_df(num_vecs, vec_size) analyzed_df = tfs.analyze(df) - input_col = 'vec' - output_col = 'outputCol' # Build the TensorFlow graph - model_temp_dir = tempfile.mkdtemp() - ckpt_dir = os.path.join(model_temp_dir, 'model_ckpt') with tf.Session() as sess: + #x = tf.placeholder(tf.float64, shape=[None, vec_size]) + x = tfs.block(analyzed_df, 'vec') + z = tf.reduce_mean(x, axis=1) + graph = sess.graph + + # Get the reference data + _results = [] + for row in df.collect(): + arr = np.array(row.vec)[np.newaxis, :] + _results.append(sess.run(z, {x: arr})) + out_ref = np.hstack(_results) + + # Apply the transform + transfomer = TFTransformer(tfGraph=graph, + inputMapping={ + 'vec': x + }, + outputMapping={ + z: 'outCol' + }) + final_df = transfomer.transform(analyzed_df) + out_tgt = grab_df_arr(final_df, 'outCol') + + self.assertTrue(np.allclose(out_ref, out_tgt)) + + + def test_build_from_saved_model(self): + # Setup dataset + vec_size = 17 + num_vecs = 31 + df = self._get_rand_vec_df(num_vecs, vec_size) + analyzed_df = tfs.analyze(df) + input_col = 'vec' + output_col = 'outputCol' + + # Setup saved model export directory + saved_model_root = tempfile.mkdtemp() + saved_model_dir = os.path.join(saved_model_root, 'saved_model') + serving_tag = "serving_tag" + serving_sigdef_key = 'prediction_signature' + + builder = tf.saved_model.builder.SavedModelBuilder(saved_model_dir) + with tf.Session(graph=tf.Graph()) as sess: + # Model definition: begin x = tf.placeholder(tf.float64, shape=[None, vec_size], name='tnsrIn') + #x = tf.placeholder(tf.float64, shape=[None, vec_size], name=input_col) w = tf.Variable(tf.random_normal([vec_size], dtype=tf.float64), dtype=tf.float64, name='varW') z = tf.reduce_mean(x * w, axis=1, name='tnsrOut') + # Model definition ends + sess.run(w.initializer) - saver = tf.train.Saver(var_list=[w]) - saved_path = saver.save(sess, ckpt_dir, global_step=2702) + sig_inputs = { + 'input_sig': tf.saved_model.utils.build_tensor_info(x)} + sig_outputs = { + 'output_sig': tf.saved_model.utils.build_tensor_info(z)} + + serving_sigdef = tf.saved_model.signature_def_utils.build_signature_def( + inputs=sig_inputs, + outputs=sig_outputs) + + builder.add_meta_graph_and_variables(sess, + [serving_tag], + signature_def_map={ + serving_sigdef_key: serving_sigdef + }) # Get the reference data _results = [] - for row in df.rdd.toLocalIterator(): + for row in df.collect(): arr = np.array(row.vec)[np.newaxis, :] _results.append(sess.run(z, {x: arr})) out_ref = np.hstack(_results) - # Load the saved model checkpoint - # We want to clear device assignment in order to run it anywhere we want - with IsolatedSession() as issn: - saver = tf.train.import_meta_graph('{}.meta'.format(saved_path), clear_devices=True) - saver.restore(issn.sess, saved_path) - gfn = issn.asGraphFunction( - [tfx.get_tensor(issn.graph, 'tnsrIn')], - [tfx.get_tensor(issn.graph, 'tnsrOut')]) - - transformer = TFTransformer(tfGraph=gfn, - inputMapping={ - input_col: 'tnsrIn' - }, - outputMapping={ - 'tnsrOut': output_col - }) - final_df = transformer.transform(analyzed_df) - out_tgt = grab_df_arr(final_df, output_col) - - shutil.rmtree(model_temp_dir, ignore_errors=True) - self.assertTrue(np.allclose(out_ref, out_tgt)) - - def test_simple(self): - # Build a simple input DataFrame + # Save the model + builder.save() + + # Build the transformer from exported serving model + # We are using signaures, thus must provide the keys + trans_with_sig = TFTransformer(exportDir=saved_model_dir, + signatureDefKey=serving_sigdef_key, + tagSet=serving_tag, + inputMapping={ + input_col: 'input_sig' + }, + outputMapping={ + 'output_sig': output_col + }) + + # Build the transformer from exported serving model + # We are not using signatures, thus must provide tensor/operation names + trans_no_sig = TFTransformer(exportDir=saved_model_dir, + signatureDefKey=None, + tagSet=serving_tag, + inputMapping={ + input_col: 'tnsrIn' + }, + outputMapping={ + 'tnsrOut': output_col + }) + + df_trans_with_sig = trans_with_sig.transform(analyzed_df) + df_trans_no_sig = trans_no_sig.transform(analyzed_df) + out_with_sig_tgt = grab_df_arr(df_trans_with_sig, output_col) + out_no_sig_tgt = grab_df_arr(df_trans_no_sig, output_col) + # Cleanup the resources + shutil.rmtree(saved_model_root, ignore_errors=True) + self.assertTrue(np.allclose(out_ref, out_with_sig_tgt)) + self.assertTrue(np.allclose(out_ref, out_no_sig_tgt)) + + + def test_build_from_checkpoint(self): vec_size = 17 num_vecs = 31 df = self._get_rand_vec_df(num_vecs, vec_size) analyzed_df = tfs.analyze(df) + input_col = 'vec' + output_col = 'outputCol' # Build the TensorFlow graph - with tf.Session() as sess: - #x = tf.placeholder(tf.float64, shape=[None, vec_size]) - x = tfs.block(analyzed_df, 'vec') - z = tf.reduce_mean(x, axis=1) - graph = sess.graph + model_ckpt_dir = tempfile.mkdtemp() + ckpt_path_prefix = os.path.join(model_ckpt_dir, 'model_ckpt') + # Warning: please use a new graph for each test cases + # or the tests could affect one another + with tf.Session(graph=tf.Graph()) as sess: + x = tf.placeholder(tf.float64, shape=[None, vec_size], name='tnsrIn') + #x = tf.placeholder(tf.float64, shape=[None, vec_size], name=input_col) + w = tf.Variable(tf.random_normal([vec_size], dtype=tf.float64), + dtype=tf.float64, name='varW') + z = tf.reduce_mean(x * w, axis=1, name='tnsrOut') + sess.run(w.initializer) + saver = tf.train.Saver(var_list=[w]) + _ = saver.save(sess, ckpt_path_prefix, global_step=2702) # Get the reference data _results = [] - for row in df.rdd.toLocalIterator(): + for row in df.collect(): arr = np.array(row.vec)[np.newaxis, :] _results.append(sess.run(z, {x: arr})) out_ref = np.hstack(_results) - # Apply the transform - transfomer = TFTransformer(tfGraph=graph, - inputMapping={ - 'vec': x - }, - outputMapping={ - z: 'outCol' - }) - final_df = transfomer.transform(analyzed_df) - out_tgt = grab_df_arr(final_df, 'outCol') + transformer = TFTransformer(tfCheckpointDir=model_ckpt_dir, + inputMapping={ + input_col: 'tnsrIn' + }, + outputMapping={ + 'tnsrOut': output_col + }) + final_df = transformer.transform(analyzed_df) + out_tgt = grab_df_arr(final_df, output_col) + shutil.rmtree(model_ckpt_dir, ignore_errors=True) self.assertTrue(np.allclose(out_ref, out_tgt)) @@ -147,7 +229,7 @@ def test_multi_io(self): # Get the reference data p_out_ref = [] q_out_ref = [] - for row in df.rdd.toLocalIterator(): + for row in df.collect(): arr_x = np.array(row['vec_x'])[np.newaxis, :] arr_y = np.array(row['vec_y'])[np.newaxis, :] p_val, q_val = sess.run([p, q], {x: arr_x, y: arr_y}) @@ -233,4 +315,3 @@ def test_map_blocks_graph(self): arr_tgt = grab_df_arr(transformed_df, output_col) self.assertTrue(np.allclose(arr_ref, arr_tgt)) - From ab89bd271e7ee964f6ed24a7b836e591eacf7fe7 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 8 Sep 2017 18:27:56 -0700 Subject: [PATCH 03/23] (WIP) new interface implementation --- python/sparkdl/transformers/param.py | 121 +++-------- python/sparkdl/transformers/tf_tensor.py | 227 ++++++++++++-------- python/tests/transformers/tf_tensor_test.py | 102 ++++----- 3 files changed, 219 insertions(+), 231 deletions(-) diff --git a/python/sparkdl/transformers/param.py b/python/sparkdl/transformers/param.py index e11789d0..90edbad0 100644 --- a/python/sparkdl/transformers/param.py +++ b/python/sparkdl/transformers/param.py @@ -58,9 +58,6 @@ class HasInputCol(Params): inputCol = Param(Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString) - def __init__(self): - super(HasInputCol, self).__init__() - def setInputCol(self, value): """ Sets the value of :py:attr:`inputCol`. @@ -116,6 +113,16 @@ def toTFGraph(value): else: raise TypeError("Could not convert %s to TensorFlow Graph" % type(value)) + @staticmethod + def toTFInputGraph(value): + return value + # if isinstance(value, tf.Graph): + # return value.as_graph_def(add_shapes=True) + # elif isinstance(value, tf.GraphDef): + # return value + # else: + # raise TypeError("Could not convert %s to TFInputGraph" % type(value)) + @staticmethod def asColumnToTensorMap(value): if isinstance(value, dict): @@ -156,14 +163,6 @@ def converter(value): raise TypeError("%s %s is not in the supported list." % type(value), str(value)) return converter - -class HasTFHParams(Params): - """ - Mixin for TensorFlow params - """ - tfHParms = Param(Params._dummy(), "hparams", "instance of :class:`tf.contrib.training.HParams`", - typeConverter=SparkDLTypeConverters.toTFHParams) - # New in sparkdl class HasOutputMapping(Params): @@ -174,9 +173,6 @@ class HasOutputMapping(Params): "Mapping output :class:`tf.Tensor` objects to DataFrame column names", typeConverter=SparkDLTypeConverters.asTensorToColumnMap) - def __init__(self): - super(HasOutputMapping, self).__init__() - def setOutputMapping(self, value): return self._set(outputMapping=value) @@ -192,9 +188,6 @@ class HasInputMapping(Params): "Mapping input DataFrame column names to :class:`tf.Tensor` objects", typeConverter=SparkDLTypeConverters.asColumnToTensorMap) - def __init__(self): - super(HasInputMapping, self).__init__() - def setInputMapping(self, value): return self._set(inputMapping=value) @@ -202,90 +195,34 @@ def getInputMapping(self): return self.getOrDefault(self.inputMapping) -class HasTagSet(Params): - # TODO: add docs - tagSet = Param(Params._dummy(), "tagSet", - "signature def tag set", - typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasTagSet, self).__init__() - # TODO: add default value - - def setTagSet(self, value): - return self._set(tagSet=value) - - def getTagSet(self): - return self.getOrDefault(self.tagSet) - - -class HasSignatureDefKey(Params): - # TODO: add docs - signatureDefKey = Param(Params._dummy(), "signatureDefKey", - "signature def", - typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasSignatureDefKey, self).__init__() - # TODO: add default value - - def setSignatureDefKey(self, value): - return self._set(signatureDefKey=value) - - def getSignatureDefKey(self): - return self.getOrDefault(self.signatureDefKey) - - -class HasExportDir(Params): +class HasTFInputGraph(Params): """ - Mixin for param for constructing inputs - """ - exportDir = Param(Params._dummy(), "exportDir", - "Directory of saved model", - typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasExportDir, self).__init__() - - def setExportDir(self, value): - return self._set(exportDir=value) - - def getExportDir(self): - return self.getOrDefault(self.exportDir) - - -class HasTFCheckpointDir(Params): - """ - Mixin for TensorFlow model checkpoint + Mixin for param tfGraph: the :class:`tf.Graph` object that represents a TensorFlow computation. """ - tfCheckpointDir = Param(Params._dummy(), "tfCheckpointDir", - "Directory that contains a model checkpoint", - typeConverter=TypeConverters.toString) + tfInputGraph = Param(Params._dummy(), "tfInputGraph", + "TensorFlow Graph object", + typeConverter=SparkDLTypeConverters.toTFInputGraph) def __init__(self): - super(HasTFCheckpointDir, self).__init__() + super(HasTFInputGraph, self).__init__() + self._setDefault(tfInputGraph=None) - def setTFCheckpointDir(self, value): - return self._set(tfCheckpointDir=value) + def setTFInputGraph(self, value): + return self._set(tfInputGraph=value) - def getTFCheckpointDir(self): - return self.getOrDefault(self.tfCheckpointDir) + def getTFInputGraph(self): + return self.getOrDefault(self.tfInputGraph) -class HasTFGraph(Params): +class HasTFHParams(Params): """ - Mixin for param tfGraph: the :class:`tf.Graph` object that represents a TensorFlow computation. + Mixin for TensorFlow model hyper-parameters """ - tfGraph = Param(Params._dummy(), "tfGraph", - "TensorFlow Graph object", - typeConverter=SparkDLTypeConverters.toTFGraph) - - def __init__(self): - super(HasTFGraph, self).__init__() - self._setDefault(tfGraph=None) + tfHParams = Param(Params._dummy(), "hparams", "instance of :class:`tf.contrib.training.HParams`", + typeConverter=SparkDLTypeConverters.toTFHParams) - def setTFGraph(self, value): - return self._set(tfGraph=value) + def setTFHParams(self, value): + return self._set(tfHParam=value) - def getTFGraph(self): - return self.getOrDefault(self.tfGraph) + def getTFHParams(self): + return self.getOrDefault(self.tfHParams) diff --git a/python/sparkdl/transformers/tf_tensor.py b/python/sparkdl/transformers/tf_tensor.py index 24ae4365..54c05460 100644 --- a/python/sparkdl/transformers/tf_tensor.py +++ b/python/sparkdl/transformers/tf_tensor.py @@ -23,135 +23,182 @@ from pyspark.ml.param import Param, Params from pyspark.sql.functions import udf -from sparkdl.graph.builder import IsolatedSession +from sparkdl.graph.builder import GraphFunction, IsolatedSession import sparkdl.graph.utils as tfx from sparkdl.transformers.param import ( - keyword_only, HasInputMapping, HasOutputMapping, SparkDLTypeConverters, - HasTFGraph, HasTFHParams, HasTFCheckpointDir, HasExportDir, HasTagSet, HasSignatureDefKey) + keyword_only, SparkDLTypeConverters, HasInputMapping, + HasOutputMapping, HasTFInputGraph, HasTFHParams) __all__ = ['TFTransformer'] logger = logging.getLogger('sparkdl') -class TFTransformer(Transformer, HasTFCheckpointDir, HasTFGraph, - HasExportDir, HasTagSet, HasSignatureDefKey, - HasTFHParams, HasInputMapping, HasOutputMapping): +class TFInputGraph(object): + def __init__(self, graph_function, input_mapping, output_mapping): + # GraphFunction + self.graph_function = graph_function + # type: (str, str) list + if isinstance(input_mapping, dict): + input_mapping = input_mapping.items() + self.input_mapping = sorted(input_mapping) + # type: (str, str) list + if isinstance(output_mapping, dict): + output_mapping = output_mapping.items() + self.output_mapping = sorted(output_mapping) + +class TFInputGraphBuilder(object): """ - Applies the TensorFlow graph to the array column in DataFrame. - - Restrictions of the current API: - - We assume that - - All graphs have a "minibatch" dimension (i.e. an unknown leading - dimension) in the tensor shapes. - - Input DataFrame has an array column where all elements have the same length + Create a builder function so as to be able to compile graph for inference. + The actual compilation will be done at the time when the + inputs (feeds) and outputs (fetches) are provided. """ + def __init__(self, graph_import_fn): + # Return graph_def, input_mapping, output_mapping + self.graph_import_fn = graph_import_fn - @keyword_only - def __init__(self, tfCheckpointDir=None, tfGraph=None, - exportDir=None, tagSet=None, signatureDefKey=None, - inputMapping=None, outputMapping=None, tfHParms=None): - """ - __init__(self, tfCheckpointDir=None, tfGraph=None, - exportDir=None, tagSet=None, signatureDefKey=None, - inputMapping=None, outputMapping=None, tfHParms=None) - """ - super(TFTransformer, self).__init__() - kwargs = self._input_kwargs - self.setParams(**kwargs) - - - @keyword_only - def setParams(self, tfCheckpointDir=None, tfGraph=None, - exportDir=None, tagSet=None, signatureDefKey=None, - inputMapping=None, outputMapping=None, tfHParms=None): - """ - setParams(self, tfCheckpointDir=None, tfGraph=None, - exportDir=None, tagSet=None, signatureDefKey=None, - inputMapping=None, outputMapping=None, tfHParms=None) - """ - super(TFTransformer, self).__init__() - kwargs = self._input_kwargs - return self._set(**kwargs) - + def build(self, input_mapping, output_mapping): - def _convertInternal(self): - assert self.isDefined(self.inputMapping) and self.isDefined(self.outputMapping), \ - "inputMapping and outputMapping must be defined" - - _maybe_graph = self.getTFGraph() - _maybe_meta_graph_def = None - with IsolatedSession(graph=_maybe_graph) as issn: - if self.isDefined(self.exportDir): - assert _maybe_graph is None - assert not self.isDefined(self.tfCheckpointDir) - tag_set = self.getTagSet().split(',') - _maybe_meta_graph_def = tf.saved_model.loader.load( - issn.sess, tag_set, self.getExportDir()) - elif self.isDefined(self.tfCheckpointDir): - assert _maybe_graph is None - ckpt_dir = self.getTFCheckpointDir() - ckpt_path = tf.train.latest_checkpoint(ckpt_dir) - print('using checkpoint path from {} as {}'.format(ckpt_dir, ckpt_path)) - saver = tf.train.import_meta_graph("{}.meta".format(ckpt_path), clear_devices=True) - saver.restore(issn.sess, ckpt_path) - _maybe_meta_graph_def = saver.export_meta_graph(clear_devices=True) - else: - assert _maybe_graph is not None - - sig_def = None - if self.isDefined(self.signatureDefKey): - sig_def_key = self.getSignatureDefKey() - if sig_def_key is not None: - meta_graph_def = _maybe_meta_graph_def - assert meta_graph_def is not None - #print('sigdef:', meta_graph_def.signature_def) - sig_def = tf.contrib.saved_model.get_signature_def_by_key( - meta_graph_def, sig_def_key) - assert sig_def is not None + with IsolatedSession() as issn: + sig_def = self.graph_import_fn(issn.sess) + # Append feeds and input mapping feeds = [] _input_mapping = {} - for input_colname, tnsr_or_sig in self.getInputMapping(): + for input_colname, tnsr_or_sig in input_mapping.items(): if sig_def: tnsr = sig_def.inputs[tnsr_or_sig].name - _input_mapping[input_colname] = tfx.op_name(issn.graph, tnsr) else: tnsr = tnsr_or_sig + _input_mapping[input_colname] = tfx.op_name(issn.graph, tnsr) feeds.append(tfx.get_tensor(issn.graph, tnsr)) + input_mapping = _input_mapping - if sig_def: - self.setInputMapping(_input_mapping) - + # Append fetches and output mapping fetches = [] + _output_mapping = {} # By default the output columns will have the name of their # corresponding `tf.Graph` operation names. # We have to convert them to the user specified output names - self.output_renaming = {} - for tnsr_or_sig, output_colname in self.getOutputMapping(): + for tnsr_or_sig, requested_colname in output_mapping.items(): if sig_def: tnsr = sig_def.outputs[tnsr_or_sig].name else: tnsr = tnsr_or_sig fetches.append(tfx.get_tensor(issn.graph, tnsr)) - tf_expected_colname = tfx.op_name(issn.graph, tnsr) - self.output_renaming[tf_expected_colname] = output_colname + tf_output_colname = tfx.op_name(issn.graph, tnsr) + _output_mapping[tf_output_colname] = requested_colname + output_mapping = _output_mapping + + gfn = issn.asGraphFunction(feeds, fetches, strip_and_freeze=True) + + return TFInputGraph(gfn, input_mapping, output_mapping) + + @classmethod + def fromGraph(cls, graph): + assert isinstance(graph, tf.Graph), \ + ('expect tf.Graph type but got', type(graph)) + + def import_graph_fn(sess): + #graph.finalize() + gdef = graph.as_graph_def(add_shapes=True) + tf.import_graph_def(gdef, name='') + return None # no meta_graph_def + + return cls(import_graph_fn) + + @classmethod + def fromGraphDef(cls, graph_def): + assert isinstance(graph_def, tf.GraphDef), \ + ('expect tf.GraphDef type but got', type(graph_def)) + + def import_graph_fn(sess): + tf.import_graph_def(graph_def, name='') + return None + + return cls(import_graph_fn) + + @classmethod + def fromCheckpointDir(cls, checkpoint_dir, signature_def_key=None): - # Consolidate the input format into a serialized format - self.gfn = issn.asGraphFunction(feeds, fetches, strip_and_freeze=True) + def import_graph_fn(sess): + # Load checkpoint and import the graph + ckpt_path = tf.train.latest_checkpoint(checkpoint_dir) + saver = tf.train.import_meta_graph("{}.meta".format(ckpt_path), clear_devices=True) + saver.restore(sess, ckpt_path) + meta_graph_def = saver.export_meta_graph(clear_devices=True) + sig_def = None + if signature_def_key is not None: + sig_def = tf.contrib.saved_model.get_signature_def_by_key( + meta_graph_def, signature_def_key) + + return sig_def + + return cls(import_graph_fn) + + @classmethod + def fromSavedModelDir(cls, saved_model_dir, tag_set, signature_def_key=None): + + def import_graph_fn(sess): + tag_sets = tag_set.split(',') + meta_graph_def = tf.saved_model.loader.load(sess, tag_sets, saved_model_dir) + + sig_def = None + if signature_def_key is not None: + sig_def = tf.contrib.saved_model.get_signature_def_by_key( + meta_graph_def, signature_def_key) + + return sig_def + + return cls(import_graph_fn) + + +class TFTransformer(Transformer, HasTFInputGraph, HasTFHParams, HasInputMapping, HasOutputMapping): + """ + Applies the TensorFlow graph to the array column in DataFrame. + + Restrictions of the current API: + + We assume that + - All graphs have a "minibatch" dimension (i.e. an unknown leading + dimension) in the tensor shapes. + - Input DataFrame has an array column where all elements have the same length + """ + + @keyword_only + def __init__(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfHParms=None): + """ + __init__(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfHParms=None) + """ + super(TFTransformer, self).__init__() + kwargs = self._input_kwargs + gin = tfInputGraph.build(inputMapping, outputMapping) + kwargs['tfInputGraph'] = gin + self.setParams(**kwargs) + + @keyword_only + def setParams(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfHParms=None): + """ + setParams(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfHParms=None) + """ + super(TFTransformer, self).__init__() + kwargs = self._input_kwargs + return self._set(**kwargs) def _transform(self, dataset): - self._convertInternal() + gin = self.getTFInputGraph() + input_mapping = gin.input_mapping + output_mapping = gin.output_mapping with IsolatedSession() as issn: analyzed_df = tfs.analyze(dataset) - _, fetches = issn.importGraphFunction(self.gfn, prefix='') - feed_dict = dict([(tnsr_name, col_name) for col_name, tnsr_name in self.getInputMapping()]) + _, fetches = issn.importGraphFunction(gin.graph_function, prefix='') + feed_dict = dict([(tnsr_name, col_name) for col_name, tnsr_name in input_mapping]) + out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict) # We still have to rename output columns - for old_colname, new_colname in self.output_renaming.items(): + for old_colname, new_colname in output_mapping: if old_colname != new_colname: out_df = out_df.withColumnRenamed(old_colname, new_colname) diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index 296709c8..a61d1b54 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -27,7 +27,7 @@ from sparkdl.graph.builder import IsolatedSession import sparkdl.graph.utils as tfx -from sparkdl.transformers.tf_tensor import TFTransformer +from sparkdl.transformers.tf_tensor import TFTransformer, TFInputGraphBuilder from ..tests import SparkDLTestCase @@ -65,13 +65,14 @@ def test_build_from_tf_graph(self): out_ref = np.hstack(_results) # Apply the transform - transfomer = TFTransformer(tfGraph=graph, - inputMapping={ - 'vec': x - }, - outputMapping={ - z: 'outCol' - }) + transfomer = TFTransformer( + tfInputGraph=TFInputGraphBuilder.fromGraph(graph), + inputMapping={ + 'vec': x + }, + outputMapping={ + z: 'outCol' + }) final_df = transfomer.transform(analyzed_df) out_tgt = grab_df_arr(final_df, 'outCol') @@ -131,27 +132,27 @@ def test_build_from_saved_model(self): # Build the transformer from exported serving model # We are using signaures, thus must provide the keys - trans_with_sig = TFTransformer(exportDir=saved_model_dir, - signatureDefKey=serving_sigdef_key, - tagSet=serving_tag, - inputMapping={ - input_col: 'input_sig' - }, - outputMapping={ - 'output_sig': output_col - }) + trans_with_sig = TFTransformer( + tfInputGraph=TFInputGraphBuilder.fromSavedModelDir( + saved_model_dir, tag_set=serving_tag, signature_def_key=serving_sigdef_key), + inputMapping={ + input_col: 'input_sig' + }, + outputMapping={ + 'output_sig': output_col + }) # Build the transformer from exported serving model # We are not using signatures, thus must provide tensor/operation names - trans_no_sig = TFTransformer(exportDir=saved_model_dir, - signatureDefKey=None, - tagSet=serving_tag, - inputMapping={ - input_col: 'tnsrIn' - }, - outputMapping={ - 'tnsrOut': output_col - }) + trans_no_sig = TFTransformer( + tfInputGraph=TFInputGraphBuilder.fromSavedModelDir( + saved_model_dir, tag_set=serving_tag, signature_def_key=None), + inputMapping={ + input_col: 'tnsrIn' + }, + outputMapping={ + 'tnsrOut': output_col + }) df_trans_with_sig = trans_with_sig.transform(analyzed_df) df_trans_no_sig = trans_no_sig.transform(analyzed_df) @@ -193,13 +194,14 @@ def test_build_from_checkpoint(self): _results.append(sess.run(z, {x: arr})) out_ref = np.hstack(_results) - transformer = TFTransformer(tfCheckpointDir=model_ckpt_dir, - inputMapping={ - input_col: 'tnsrIn' - }, - outputMapping={ - 'tnsrOut': output_col - }) + transformer = TFTransformer( + tfInputGraph=TFInputGraphBuilder.fromCheckpointDir(model_ckpt_dir), + inputMapping={ + input_col: 'tnsrIn' + }, + outputMapping={ + 'tnsrOut': output_col + }) final_df = transformer.transform(analyzed_df) out_tgt = grab_df_arr(final_df, output_col) @@ -239,15 +241,16 @@ def test_multi_io(self): q_out_ref = np.hstack(q_out_ref) # Apply the transform - transfomer = TFTransformer(tfGraph=graph, - inputMapping={ - 'vec_x': x, - 'vec_y': y - }, - outputMapping={ - p: 'outcol_p', - q: 'outcol_q' - }) + transfomer = TFTransformer( + tfInputGraph=TFInputGraphBuilder.fromGraph(graph), + inputMapping={ + 'vec_x': x, + 'vec_y': y + }, + outputMapping={ + p: 'outcol_p', + q: 'outcol_q' + }) final_df = transfomer.transform(analyzed_df) p_out_tgt = grab_df_arr(final_df, 'outcol_p') q_out_tgt = grab_df_arr(final_df, 'outcol_q') @@ -303,13 +306,14 @@ def test_map_blocks_graph(self): arr_ref = grab_df_arr(final_df, output_col) # Using the Transformer - transformer = TFTransformer(tfGraph=gfn, - inputMapping={ - input_col: gfn.input_names[0] - }, - outputMapping={ - gfn.output_names[0]: output_col - }) + transformer = TFTransformer( + tfInputGraph=TFInputGraphBuilder.fromGraphDef(gfn.graph_def), + inputMapping={ + input_col: gfn.input_names[0] + }, + outputMapping={ + gfn.output_names[0]: output_col + }) transformed_df = transformer.transform(analyzed_df) arr_tgt = grab_df_arr(transformed_df, output_col) From 8c7d72e629673f5ce4182214e2ae3495d4160cf9 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 8 Sep 2017 21:06:50 -0700 Subject: [PATCH 04/23] docs and cleanup --- python/sparkdl/transformers/tf_tensor.py | 63 +++++++++++++++------ python/tests/transformers/tf_tensor_test.py | 6 +- 2 files changed, 50 insertions(+), 19 deletions(-) diff --git a/python/sparkdl/transformers/tf_tensor.py b/python/sparkdl/transformers/tf_tensor.py index 54c05460..b655e85a 100644 --- a/python/sparkdl/transformers/tf_tensor.py +++ b/python/sparkdl/transformers/tf_tensor.py @@ -29,21 +29,37 @@ keyword_only, SparkDLTypeConverters, HasInputMapping, HasOutputMapping, HasTFInputGraph, HasTFHParams) -__all__ = ['TFTransformer'] +__all__ = ['TFTransformer', 'TFInputGraphBuilder'] logger = logging.getLogger('sparkdl') +def _assert_set_incl(seq_small, seq_large, msg): + set_small = set(seq_small) + set_large = set(seq_large) + assert set_small <= set_large, \ + 'set not inclusive: {} => diff items {}'.format(msg, set_small - set_large) + class TFInputGraph(object): + """ + An opaque serializable object containing TensorFlow graph. + """ + # TODO: for (de-)serialization, the class should correspond to a ProtocolBuffer definition. def __init__(self, graph_function, input_mapping, output_mapping): # GraphFunction self.graph_function = graph_function - # type: (str, str) list + + _assert_set_incl(input_mapping.values(), graph_function.input_names, 'input names') if isinstance(input_mapping, dict): - input_mapping = input_mapping.items() + input_mapping = list(input_mapping.items()) + assert isinstance(input_mapping, list), \ + "output mapping must be a list of strings, found type {}".format(type(input_mapping)) self.input_mapping = sorted(input_mapping) - # type: (str, str) list + + _assert_set_incl(output_mapping.keys(), graph_function.output_names, 'output names') if isinstance(output_mapping, dict): - output_mapping = output_mapping.items() + output_mapping = list(output_mapping.items()) + assert isinstance(output_mapping, list), \ + "output mapping must be a list of strings, found type {}".format(type(output_mapping)) self.output_mapping = sorted(output_mapping) class TFInputGraphBuilder(object): @@ -51,13 +67,18 @@ class TFInputGraphBuilder(object): Create a builder function so as to be able to compile graph for inference. The actual compilation will be done at the time when the inputs (feeds) and outputs (fetches) are provided. + :param graph_import_fn: `tf.Session` -> `tf.signature_def`, load a graph to the provided session. + If the meta_graph contains a `signature_def`, return it. """ def __init__(self, graph_import_fn): - # Return graph_def, input_mapping, output_mapping + # Return signature_def if the underlying graph contains one self.graph_import_fn = graph_import_fn def build(self, input_mapping, output_mapping): - + """ + Create a serializable TensorFlow graph representation + :param input_mapping: dict, from input DataFrame column name to internal graph name. + """ with IsolatedSession() as issn: sig_def = self.graph_import_fn(issn.sess) @@ -95,31 +116,40 @@ def build(self, input_mapping, output_mapping): @classmethod def fromGraph(cls, graph): + """ + Construct a TFInputGraphBuilder from a in memory tf.Graph object + """ assert isinstance(graph, tf.Graph), \ ('expect tf.Graph type but got', type(graph)) def import_graph_fn(sess): - #graph.finalize() gdef = graph.as_graph_def(add_shapes=True) - tf.import_graph_def(gdef, name='') + with sess.as_default(): + tf.import_graph_def(gdef, name='') return None # no meta_graph_def return cls(import_graph_fn) @classmethod def fromGraphDef(cls, graph_def): + """ + Construct a TFInputGraphBuilder from a tf.GraphDef object + """ assert isinstance(graph_def, tf.GraphDef), \ ('expect tf.GraphDef type but got', type(graph_def)) def import_graph_fn(sess): - tf.import_graph_def(graph_def, name='') + with sess.as_default(): + tf.import_graph_def(graph_def, name='') return None return cls(import_graph_fn) @classmethod - def fromCheckpointDir(cls, checkpoint_dir, signature_def_key=None): - + def fromCheckpoint(cls, checkpoint_dir, signature_def_key=None): + """ + Construct a TFInputGraphBuilder from a model checkpoint + """ def import_graph_fn(sess): # Load checkpoint and import the graph ckpt_path = tf.train.latest_checkpoint(checkpoint_dir) @@ -137,8 +167,10 @@ def import_graph_fn(sess): return cls(import_graph_fn) @classmethod - def fromSavedModelDir(cls, saved_model_dir, tag_set, signature_def_key=None): - + def fromSavedModel(cls, saved_model_dir, tag_set, signature_def_key=None): + """ + Construct a TFInputGraphBuilder from a SavedModel + """ def import_graph_fn(sess): tag_sets = tag_set.split(',') meta_graph_def = tf.saved_model.loader.load(sess, tag_sets, saved_model_dir) @@ -172,8 +204,6 @@ def __init__(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfH """ super(TFTransformer, self).__init__() kwargs = self._input_kwargs - gin = tfInputGraph.build(inputMapping, outputMapping) - kwargs['tfInputGraph'] = gin self.setParams(**kwargs) @keyword_only @@ -183,6 +213,7 @@ def setParams(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tf """ super(TFTransformer, self).__init__() kwargs = self._input_kwargs + kwargs['tfInputGraph'] = tfInputGraph.build(inputMapping, outputMapping) return self._set(**kwargs) def _transform(self, dataset): diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index a61d1b54..d9360c2d 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -133,7 +133,7 @@ def test_build_from_saved_model(self): # Build the transformer from exported serving model # We are using signaures, thus must provide the keys trans_with_sig = TFTransformer( - tfInputGraph=TFInputGraphBuilder.fromSavedModelDir( + tfInputGraph=TFInputGraphBuilder.fromSavedModel( saved_model_dir, tag_set=serving_tag, signature_def_key=serving_sigdef_key), inputMapping={ input_col: 'input_sig' @@ -145,7 +145,7 @@ def test_build_from_saved_model(self): # Build the transformer from exported serving model # We are not using signatures, thus must provide tensor/operation names trans_no_sig = TFTransformer( - tfInputGraph=TFInputGraphBuilder.fromSavedModelDir( + tfInputGraph=TFInputGraphBuilder.fromSavedModel( saved_model_dir, tag_set=serving_tag, signature_def_key=None), inputMapping={ input_col: 'tnsrIn' @@ -195,7 +195,7 @@ def test_build_from_checkpoint(self): out_ref = np.hstack(_results) transformer = TFTransformer( - tfInputGraph=TFInputGraphBuilder.fromCheckpointDir(model_ckpt_dir), + tfInputGraph=TFInputGraphBuilder.fromCheckpoint(model_ckpt_dir), inputMapping={ input_col: 'tnsrIn' }, From eb543c6cada6c757691789401f2c8810b32706a3 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Sat, 9 Sep 2017 20:43:42 -0700 Subject: [PATCH 05/23] using tensorflow API instead of our utilities --- python/sparkdl/transformers/tf_tensor.py | 49 +++++++++++------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/python/sparkdl/transformers/tf_tensor.py b/python/sparkdl/transformers/tf_tensor.py index b655e85a..c053c0e6 100644 --- a/python/sparkdl/transformers/tf_tensor.py +++ b/python/sparkdl/transformers/tf_tensor.py @@ -20,10 +20,7 @@ import tensorframes as tfs from pyspark.ml import Transformer -from pyspark.ml.param import Param, Params -from pyspark.sql.functions import udf -from sparkdl.graph.builder import GraphFunction, IsolatedSession import sparkdl.graph.utils as tfx from sparkdl.transformers.param import ( keyword_only, SparkDLTypeConverters, HasInputMapping, @@ -33,29 +30,21 @@ logger = logging.getLogger('sparkdl') -def _assert_set_incl(seq_small, seq_large, msg): - set_small = set(seq_small) - set_large = set(seq_large) - assert set_small <= set_large, \ - 'set not inclusive: {} => diff items {}'.format(msg, set_small - set_large) - class TFInputGraph(object): """ An opaque serializable object containing TensorFlow graph. """ # TODO: for (de-)serialization, the class should correspond to a ProtocolBuffer definition. - def __init__(self, graph_function, input_mapping, output_mapping): - # GraphFunction - self.graph_function = graph_function + def __init__(self, graph_def, input_mapping, output_mapping): + # tf.GraphDef + self.graph_def = graph_def - _assert_set_incl(input_mapping.values(), graph_function.input_names, 'input names') if isinstance(input_mapping, dict): input_mapping = list(input_mapping.items()) assert isinstance(input_mapping, list), \ "output mapping must be a list of strings, found type {}".format(type(input_mapping)) self.input_mapping = sorted(input_mapping) - _assert_set_incl(output_mapping.keys(), graph_function.output_names, 'output names') if isinstance(output_mapping, dict): output_mapping = list(output_mapping.items()) assert isinstance(output_mapping, list), \ @@ -79,19 +68,18 @@ def build(self, input_mapping, output_mapping): Create a serializable TensorFlow graph representation :param input_mapping: dict, from input DataFrame column name to internal graph name. """ - with IsolatedSession() as issn: - sig_def = self.graph_import_fn(issn.sess) + graph = tf.Graph() + with tf.Session(graph=graph) as sess: + sig_def = self.graph_import_fn(sess) # Append feeds and input mapping - feeds = [] _input_mapping = {} for input_colname, tnsr_or_sig in input_mapping.items(): if sig_def: tnsr = sig_def.inputs[tnsr_or_sig].name else: tnsr = tnsr_or_sig - _input_mapping[input_colname] = tfx.op_name(issn.graph, tnsr) - feeds.append(tfx.get_tensor(issn.graph, tnsr)) + _input_mapping[input_colname] = tfx.op_name(graph, tnsr) input_mapping = _input_mapping # Append fetches and output mapping @@ -105,14 +93,14 @@ def build(self, input_mapping, output_mapping): tnsr = sig_def.outputs[tnsr_or_sig].name else: tnsr = tnsr_or_sig - fetches.append(tfx.get_tensor(issn.graph, tnsr)) - tf_output_colname = tfx.op_name(issn.graph, tnsr) + fetches.append(tfx.get_tensor(graph, tnsr)) + tf_output_colname = tfx.op_name(graph, tnsr) _output_mapping[tf_output_colname] = requested_colname output_mapping = _output_mapping - gfn = issn.asGraphFunction(feeds, fetches, strip_and_freeze=True) + gdef = tfx.strip_and_freeze_until(fetches, graph, sess) - return TFInputGraph(gfn, input_mapping, output_mapping) + return TFInputGraph(gdef, input_mapping, output_mapping) @classmethod def fromGraph(cls, graph): @@ -221,10 +209,19 @@ def _transform(self, dataset): input_mapping = gin.input_mapping output_mapping = gin.output_mapping - with IsolatedSession() as issn: + graph = tf.Graph() + with tf.Session(graph=graph): analyzed_df = tfs.analyze(dataset) - _, fetches = issn.importGraphFunction(gin.graph_function, prefix='') - feed_dict = dict([(tnsr_name, col_name) for col_name, tnsr_name in input_mapping]) + + out_tnsr_op_names = [tfx.as_op_name(tnsr_op_name) + for tnsr_op_name, _ in output_mapping] + tf.import_graph_def(graph_def=gin.graph_def, + name='', + return_elements=out_tnsr_op_names) + + feed_dict = dict((tfx.op_name(graph, tnsr_op_name), col_name) + for col_name, tnsr_op_name in input_mapping) + fetches = [tfx.get_tensor(graph, tnsr_op_name) for tnsr_op_name in out_tnsr_op_names] out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict) From 4743bb9818517b91d8eb3b0444bad8e86a25b796 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Sat, 9 Sep 2017 20:52:50 -0700 Subject: [PATCH 06/23] automatic type conversion --- python/sparkdl/transformers/param.py | 18 ++- python/sparkdl/transformers/tf_tensor.py | 149 +------------------- python/sparkdl/transformers/utils.py | 148 ++++++++++++++++++- python/tests/transformers/tf_tensor_test.py | 54 +++---- 4 files changed, 188 insertions(+), 181 deletions(-) diff --git a/python/sparkdl/transformers/param.py b/python/sparkdl/transformers/param.py index 90edbad0..827b7234 100644 --- a/python/sparkdl/transformers/param.py +++ b/python/sparkdl/transformers/param.py @@ -29,6 +29,7 @@ from sparkdl.graph.builder import GraphFunction, IsolatedSession import sparkdl.graph.utils as tfx +from sparkdl.transformers.utils import TFInputGraph, TFInputGraphBuilder """ Copied from PySpark for backward compatibility. First in Apache Spark version 2.1.1. @@ -115,13 +116,16 @@ def toTFGraph(value): @staticmethod def toTFInputGraph(value): - return value - # if isinstance(value, tf.Graph): - # return value.as_graph_def(add_shapes=True) - # elif isinstance(value, tf.GraphDef): - # return value - # else: - # raise TypeError("Could not convert %s to TFInputGraph" % type(value)) + if isinstance(value, TFInputGraph): + return value + elif isinstance(value, TFInputGraphBuilder): + return value + elif isinstance(value, tf.Graph): + return TFInputGraphBuilder.fromGraph(value) + elif isinstance(value, tf.GraphDef): + return TFInputGraphBuilder.fromGraphDef(value) + else: + raise TypeError("Could not convert %s to TFInputGraph" % type(value)) @staticmethod def asColumnToTensorMap(value): diff --git a/python/sparkdl/transformers/tf_tensor.py b/python/sparkdl/transformers/tf_tensor.py index c053c0e6..9aba6d88 100644 --- a/python/sparkdl/transformers/tf_tensor.py +++ b/python/sparkdl/transformers/tf_tensor.py @@ -15,13 +15,13 @@ from __future__ import absolute_import, division, print_function import logging -import numpy as np import tensorflow as tf import tensorframes as tfs from pyspark.ml import Transformer import sparkdl.graph.utils as tfx +from sparkdl.transformers.utils import TFInputGraphBuilder from sparkdl.transformers.param import ( keyword_only, SparkDLTypeConverters, HasInputMapping, HasOutputMapping, HasTFInputGraph, HasTFHParams) @@ -30,149 +30,6 @@ logger = logging.getLogger('sparkdl') -class TFInputGraph(object): - """ - An opaque serializable object containing TensorFlow graph. - """ - # TODO: for (de-)serialization, the class should correspond to a ProtocolBuffer definition. - def __init__(self, graph_def, input_mapping, output_mapping): - # tf.GraphDef - self.graph_def = graph_def - - if isinstance(input_mapping, dict): - input_mapping = list(input_mapping.items()) - assert isinstance(input_mapping, list), \ - "output mapping must be a list of strings, found type {}".format(type(input_mapping)) - self.input_mapping = sorted(input_mapping) - - if isinstance(output_mapping, dict): - output_mapping = list(output_mapping.items()) - assert isinstance(output_mapping, list), \ - "output mapping must be a list of strings, found type {}".format(type(output_mapping)) - self.output_mapping = sorted(output_mapping) - -class TFInputGraphBuilder(object): - """ - Create a builder function so as to be able to compile graph for inference. - The actual compilation will be done at the time when the - inputs (feeds) and outputs (fetches) are provided. - :param graph_import_fn: `tf.Session` -> `tf.signature_def`, load a graph to the provided session. - If the meta_graph contains a `signature_def`, return it. - """ - def __init__(self, graph_import_fn): - # Return signature_def if the underlying graph contains one - self.graph_import_fn = graph_import_fn - - def build(self, input_mapping, output_mapping): - """ - Create a serializable TensorFlow graph representation - :param input_mapping: dict, from input DataFrame column name to internal graph name. - """ - graph = tf.Graph() - with tf.Session(graph=graph) as sess: - sig_def = self.graph_import_fn(sess) - - # Append feeds and input mapping - _input_mapping = {} - for input_colname, tnsr_or_sig in input_mapping.items(): - if sig_def: - tnsr = sig_def.inputs[tnsr_or_sig].name - else: - tnsr = tnsr_or_sig - _input_mapping[input_colname] = tfx.op_name(graph, tnsr) - input_mapping = _input_mapping - - # Append fetches and output mapping - fetches = [] - _output_mapping = {} - # By default the output columns will have the name of their - # corresponding `tf.Graph` operation names. - # We have to convert them to the user specified output names - for tnsr_or_sig, requested_colname in output_mapping.items(): - if sig_def: - tnsr = sig_def.outputs[tnsr_or_sig].name - else: - tnsr = tnsr_or_sig - fetches.append(tfx.get_tensor(graph, tnsr)) - tf_output_colname = tfx.op_name(graph, tnsr) - _output_mapping[tf_output_colname] = requested_colname - output_mapping = _output_mapping - - gdef = tfx.strip_and_freeze_until(fetches, graph, sess) - - return TFInputGraph(gdef, input_mapping, output_mapping) - - @classmethod - def fromGraph(cls, graph): - """ - Construct a TFInputGraphBuilder from a in memory tf.Graph object - """ - assert isinstance(graph, tf.Graph), \ - ('expect tf.Graph type but got', type(graph)) - - def import_graph_fn(sess): - gdef = graph.as_graph_def(add_shapes=True) - with sess.as_default(): - tf.import_graph_def(gdef, name='') - return None # no meta_graph_def - - return cls(import_graph_fn) - - @classmethod - def fromGraphDef(cls, graph_def): - """ - Construct a TFInputGraphBuilder from a tf.GraphDef object - """ - assert isinstance(graph_def, tf.GraphDef), \ - ('expect tf.GraphDef type but got', type(graph_def)) - - def import_graph_fn(sess): - with sess.as_default(): - tf.import_graph_def(graph_def, name='') - return None - - return cls(import_graph_fn) - - @classmethod - def fromCheckpoint(cls, checkpoint_dir, signature_def_key=None): - """ - Construct a TFInputGraphBuilder from a model checkpoint - """ - def import_graph_fn(sess): - # Load checkpoint and import the graph - ckpt_path = tf.train.latest_checkpoint(checkpoint_dir) - saver = tf.train.import_meta_graph("{}.meta".format(ckpt_path), clear_devices=True) - saver.restore(sess, ckpt_path) - meta_graph_def = saver.export_meta_graph(clear_devices=True) - - sig_def = None - if signature_def_key is not None: - sig_def = tf.contrib.saved_model.get_signature_def_by_key( - meta_graph_def, signature_def_key) - - return sig_def - - return cls(import_graph_fn) - - @classmethod - def fromSavedModel(cls, saved_model_dir, tag_set, signature_def_key=None): - """ - Construct a TFInputGraphBuilder from a SavedModel - """ - def import_graph_fn(sess): - tag_sets = tag_set.split(',') - meta_graph_def = tf.saved_model.loader.load(sess, tag_sets, saved_model_dir) - - sig_def = None - if signature_def_key is not None: - sig_def = tf.contrib.saved_model.get_signature_def_by_key( - meta_graph_def, signature_def_key) - - return sig_def - - return cls(import_graph_fn) - - class TFTransformer(Transformer, HasTFInputGraph, HasTFHParams, HasInputMapping, HasOutputMapping): """ Applies the TensorFlow graph to the array column in DataFrame. @@ -201,7 +58,9 @@ def setParams(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tf """ super(TFTransformer, self).__init__() kwargs = self._input_kwargs - kwargs['tfInputGraph'] = tfInputGraph.build(inputMapping, outputMapping) + _maybe_gin = SparkDLTypeConverters.toTFInputGraph(tfInputGraph) + if isinstance(_maybe_gin, TFInputGraphBuilder): + kwargs['tfInputGraph'] = _maybe_gin.build(inputMapping, outputMapping) return self._set(**kwargs) def _transform(self, dataset): diff --git a/python/sparkdl/transformers/utils.py b/python/sparkdl/transformers/utils.py index bb20ce2e..04ff3eff 100644 --- a/python/sparkdl/transformers/utils.py +++ b/python/sparkdl/transformers/utils.py @@ -15,11 +15,9 @@ import tensorflow as tf -from pyspark.ml.param import TypeConverters - +import sparkdl.graph.utils as tfx from sparkdl.image.imageIO import imageType - # image stuff IMAGE_INPUT_PLACEHOLDER_NAME = "sparkdl_image_input" @@ -36,3 +34,147 @@ class ImageNetConstants: class InceptionV3Constants: INPUT_SHAPE = (299, 299) NUM_OUTPUT_FEATURES = 131072 + + +class TFInputGraph(object): + """ + An opaque serializable object containing TensorFlow graph. + """ + # TODO: for (de-)serialization, the class should correspond to a ProtocolBuffer definition. + def __init__(self, graph_def, input_mapping, output_mapping): + # tf.GraphDef + self.graph_def = graph_def + + if isinstance(input_mapping, dict): + input_mapping = list(input_mapping.items()) + assert isinstance(input_mapping, list), \ + "output mapping must be a list of strings, found type {}".format(type(input_mapping)) + self.input_mapping = sorted(input_mapping) + + if isinstance(output_mapping, dict): + output_mapping = list(output_mapping.items()) + assert isinstance(output_mapping, list), \ + "output mapping must be a list of strings, found type {}".format(type(output_mapping)) + self.output_mapping = sorted(output_mapping) + + +class TFInputGraphBuilder(object): + """ + Create a builder function so as to be able to compile graph for inference. + The actual compilation will be done at the time when the + inputs (feeds) and outputs (fetches) are provided. + :param graph_import_fn: `tf.Session` -> `tf.signature_def`, load a graph to the provided session. + If the meta_graph contains a `signature_def`, return it. + """ + def __init__(self, graph_import_fn): + # Return signature_def if the underlying graph contains one + self.graph_import_fn = graph_import_fn + + def build(self, input_mapping, output_mapping): + """ + Create a serializable TensorFlow graph representation + :param input_mapping: dict, from input DataFrame column name to internal graph name. + """ + graph = tf.Graph() + with tf.Session(graph=graph) as sess: + sig_def = self.graph_import_fn(sess) + + # Append feeds and input mapping + _input_mapping = {} + for input_colname, tnsr_or_sig in input_mapping.items(): + if sig_def: + tnsr = sig_def.inputs[tnsr_or_sig].name + else: + tnsr = tnsr_or_sig + _input_mapping[input_colname] = tfx.op_name(graph, tnsr) + input_mapping = _input_mapping + + # Append fetches and output mapping + fetches = [] + _output_mapping = {} + # By default the output columns will have the name of their + # corresponding `tf.Graph` operation names. + # We have to convert them to the user specified output names + for tnsr_or_sig, requested_colname in output_mapping.items(): + if sig_def: + tnsr = sig_def.outputs[tnsr_or_sig].name + else: + tnsr = tnsr_or_sig + fetches.append(tfx.get_tensor(graph, tnsr)) + tf_output_colname = tfx.op_name(graph, tnsr) + _output_mapping[tf_output_colname] = requested_colname + output_mapping = _output_mapping + + gdef = tfx.strip_and_freeze_until(fetches, graph, sess) + + return TFInputGraph(gdef, input_mapping, output_mapping) + + @classmethod + def fromGraph(cls, graph): + """ + Construct a TFInputGraphBuilder from a in memory tf.Graph object + """ + assert isinstance(graph, tf.Graph), \ + ('expect tf.Graph type but got', type(graph)) + + def import_graph_fn(sess): + gdef = graph.as_graph_def(add_shapes=True) + with sess.as_default(): + tf.import_graph_def(gdef, name='') + return None # no meta_graph_def + + return cls(import_graph_fn) + + @classmethod + def fromGraphDef(cls, graph_def): + """ + Construct a TFInputGraphBuilder from a tf.GraphDef object + """ + assert isinstance(graph_def, tf.GraphDef), \ + ('expect tf.GraphDef type but got', type(graph_def)) + + def import_graph_fn(sess): + with sess.as_default(): + tf.import_graph_def(graph_def, name='') + return None + + return cls(import_graph_fn) + + @classmethod + def fromCheckpoint(cls, checkpoint_dir, signature_def_key=None): + """ + Construct a TFInputGraphBuilder from a model checkpoint + """ + def import_graph_fn(sess): + # Load checkpoint and import the graph + ckpt_path = tf.train.latest_checkpoint(checkpoint_dir) + saver = tf.train.import_meta_graph("{}.meta".format(ckpt_path), clear_devices=True) + saver.restore(sess, ckpt_path) + meta_graph_def = saver.export_meta_graph(clear_devices=True) + + sig_def = None + if signature_def_key is not None: + sig_def = tf.contrib.saved_model.get_signature_def_by_key( + meta_graph_def, signature_def_key) + + return sig_def + + return cls(import_graph_fn) + + @classmethod + def fromSavedModel(cls, saved_model_dir, tag_set, signature_def_key=None): + """ + Construct a TFInputGraphBuilder from a SavedModel + """ + def import_graph_fn(sess): + tag_sets = tag_set.split(',') + meta_graph_def = tf.saved_model.loader.load(sess, tag_sets, saved_model_dir) + + sig_def = None + if signature_def_key is not None: + sig_def = tf.contrib.saved_model.get_signature_def_by_key( + meta_graph_def, signature_def_key) + + return sig_def + + return cls(import_graph_fn) diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index d9360c2d..85df23a7 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -65,18 +65,19 @@ def test_build_from_tf_graph(self): out_ref = np.hstack(_results) # Apply the transform - transfomer = TFTransformer( - tfInputGraph=TFInputGraphBuilder.fromGraph(graph), - inputMapping={ - 'vec': x - }, - outputMapping={ - z: 'outCol' - }) - final_df = transfomer.transform(analyzed_df) - out_tgt = grab_df_arr(final_df, 'outCol') - - self.assertTrue(np.allclose(out_ref, out_tgt)) + gin_from_graph = TFInputGraphBuilder.fromGraph(graph) + for gin in [gin_from_graph, graph]: + transfomer = TFTransformer( + tfInputGraph=TFInputGraphBuilder.fromGraph(graph), + inputMapping={ + 'vec': x + }, + outputMapping={ + z: 'outCol' + }) + final_df = transfomer.transform(analyzed_df) + out_tgt = grab_df_arr(final_df, 'outCol') + self.assertTrue(np.allclose(out_ref, out_tgt)) def test_build_from_saved_model(self): @@ -258,7 +259,7 @@ def test_multi_io(self): self.assertTrue(np.allclose(p_out_ref, p_out_tgt)) self.assertTrue(np.allclose(q_out_ref, q_out_tgt)) - def test_map_blocks_graph(self): + def test_mixed_keras_graph(self): vec_size = 17 num_vecs = 137 @@ -306,16 +307,17 @@ def test_map_blocks_graph(self): arr_ref = grab_df_arr(final_df, output_col) # Using the Transformer - transformer = TFTransformer( - tfInputGraph=TFInputGraphBuilder.fromGraphDef(gfn.graph_def), - inputMapping={ - input_col: gfn.input_names[0] - }, - outputMapping={ - gfn.output_names[0]: output_col - }) - transformed_df = transformer.transform(analyzed_df) - - arr_tgt = grab_df_arr(transformed_df, output_col) - - self.assertTrue(np.allclose(arr_ref, arr_tgt)) + gin_from_gdef = TFInputGraphBuilder.fromGraphDef(gfn.graph_def) + for gin in [gin_from_gdef, gfn.graph_def]: + transformer = TFTransformer( + tfInputGraph=gin, + inputMapping={ + input_col: gfn.input_names[0] + }, + outputMapping={ + gfn.output_names[0]: output_col + }) + + transformed_df = transformer.transform(analyzed_df) + arr_tgt = grab_df_arr(transformed_df, output_col) + self.assertTrue(np.allclose(arr_ref, arr_tgt)) From 622c7884a9f670f7af33cbdebaf37d7447463266 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Sat, 9 Sep 2017 21:27:50 -0700 Subject: [PATCH 07/23] cleanup --- python/sparkdl/__init__.py | 8 ++++---- python/sparkdl/transformers/tf_tensor.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/sparkdl/__init__.py b/python/sparkdl/__init__.py index aa15059a..b1a921cb 100644 --- a/python/sparkdl/__init__.py +++ b/python/sparkdl/__init__.py @@ -17,11 +17,11 @@ from .transformers.keras_image import KerasImageFileTransformer from .transformers.named_image import DeepImagePredictor, DeepImageFeaturizer from .transformers.tf_image import TFImageTransformer -from .transformers.utils import imageInputPlaceholder +from .transformers.tf_tensor import TFTransformer +from .transformers.utils import imageInputPlaceholder, TFInputGraphBuilder __all__ = [ 'imageSchema', 'imageType', 'readImages', - 'TFImageTransformer', - 'DeepImagePredictor', 'DeepImageFeaturizer', - 'KerasImageFileTransformer', + 'TFImageTransformer', 'TFInputGraphBuilder', 'TFTransformer', + 'DeepImagePredictor', 'DeepImageFeaturizer', 'KerasImageFileTransformer', 'imageInputPlaceholder'] diff --git a/python/sparkdl/transformers/tf_tensor.py b/python/sparkdl/transformers/tf_tensor.py index 9aba6d88..255c264a 100644 --- a/python/sparkdl/transformers/tf_tensor.py +++ b/python/sparkdl/transformers/tf_tensor.py @@ -26,7 +26,7 @@ keyword_only, SparkDLTypeConverters, HasInputMapping, HasOutputMapping, HasTFInputGraph, HasTFHParams) -__all__ = ['TFTransformer', 'TFInputGraphBuilder'] +__all__ = ['TFTransformer'] logger = logging.getLogger('sparkdl') From 07f1cec524b9ba604692be8e7bb7c2b12c7eb54c Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Mon, 11 Sep 2017 16:09:22 -0700 Subject: [PATCH 08/23] PR comments 1. Move `InputGraph` to its module. --- python/sparkdl/graph/input.py | 166 +++++++++++++++++++++++++++ python/sparkdl/transformers/utils.py | 145 ----------------------- 2 files changed, 166 insertions(+), 145 deletions(-) create mode 100644 python/sparkdl/graph/input.py diff --git a/python/sparkdl/graph/input.py b/python/sparkdl/graph/input.py new file mode 100644 index 00000000..ea0d7502 --- /dev/null +++ b/python/sparkdl/graph/input.py @@ -0,0 +1,166 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import tensorflow as tf + +import sparkdl.graph.utils as tfx + + +class TFInputGraph(object): + """ + An opaque serializable object containing TensorFlow graph. + """ + + # TODO: for (de-)serialization, the class should correspond to a ProtocolBuffer definition. + def __init__(self, graph_def, input_mapping, output_mapping): + # tf.GraphDef + self.graph_def = graph_def + + if isinstance(input_mapping, dict): + input_mapping = list(input_mapping.items()) + assert isinstance(input_mapping, list), \ + "output mapping must be a list of strings, found type {}".format(type(input_mapping)) + self.input_mapping = sorted(input_mapping) + + if isinstance(output_mapping, dict): + output_mapping = list(output_mapping.items()) + assert isinstance(output_mapping, list), \ + "output mapping must be a list of strings, found type {}".format(type(output_mapping)) + self.output_mapping = sorted(output_mapping) + + +class TFInputGraphBuilder(object): + """ + Create a builder function so as to be able to compile graph for inference. + The actual compilation will be done at the time when the + inputs (feeds) and outputs (fetches) are provided. + :param graph_import_fn: `tf.Session` -> `tf.signature_def`, load a graph to the provided session. + If the meta_graph contains a `signature_def`, return it. + """ + + def __init__(self, graph_import_fn): + # Return signature_def if the underlying graph contains one + self.graph_import_fn = graph_import_fn + + def build(self, input_mapping, output_mapping): + """ + Create a serializable TensorFlow graph representation + :param input_mapping: dict, from input DataFrame column name to internal graph name. + """ + graph = tf.Graph() + with tf.Session(graph=graph) as sess: + sig_def = self.graph_import_fn(sess) + + # Append feeds and input mapping + _input_mapping = {} + for input_colname, tnsr_or_sig in input_mapping.items(): + if sig_def: + tnsr = sig_def.inputs[tnsr_or_sig].name + else: + tnsr = tnsr_or_sig + _input_mapping[input_colname] = tfx.op_name(graph, tnsr) + input_mapping = _input_mapping + + # Append fetches and output mapping + fetches = [] + _output_mapping = {} + # By default the output columns will have the name of their + # corresponding `tf.Graph` operation names. + # We have to convert them to the user specified output names + for tnsr_or_sig, requested_colname in output_mapping.items(): + if sig_def: + tnsr = sig_def.outputs[tnsr_or_sig].name + else: + tnsr = tnsr_or_sig + fetches.append(tfx.get_tensor(graph, tnsr)) + tf_output_colname = tfx.op_name(graph, tnsr) + _output_mapping[tf_output_colname] = requested_colname + output_mapping = _output_mapping + + gdef = tfx.strip_and_freeze_until(fetches, graph, sess) + + return TFInputGraph(gdef, input_mapping, output_mapping) + + @classmethod + def fromGraph(cls, graph): + """ + Construct a TFInputGraphBuilder from a in memory tf.Graph object + """ + assert isinstance(graph, tf.Graph), \ + ('expect tf.Graph type but got', type(graph)) + + def import_graph_fn(sess): + gdef = graph.as_graph_def(add_shapes=True) + with sess.as_default(): + tf.import_graph_def(gdef, name='') + return None # no meta_graph_def + + return cls(import_graph_fn) + + @classmethod + def fromGraphDef(cls, graph_def): + """ + Construct a TFInputGraphBuilder from a tf.GraphDef object + """ + assert isinstance(graph_def, tf.GraphDef), \ + ('expect tf.GraphDef type but got', type(graph_def)) + + def import_graph_fn(sess): + with sess.as_default(): + tf.import_graph_def(graph_def, name='') + return None + + return cls(import_graph_fn) + + @classmethod + def fromCheckpoint(cls, checkpoint_dir, signature_def_key=None): + """ + Construct a TFInputGraphBuilder from a model checkpoint + """ + + def import_graph_fn(sess): + # Load checkpoint and import the graph + ckpt_path = tf.train.latest_checkpoint(checkpoint_dir) + saver = tf.train.import_meta_graph("{}.meta".format(ckpt_path), clear_devices=True) + saver.restore(sess, ckpt_path) + meta_graph_def = saver.export_meta_graph(clear_devices=True) + + sig_def = None + if signature_def_key is not None: + sig_def = tf.contrib.saved_model.get_signature_def_by_key( + meta_graph_def, signature_def_key) + + return sig_def + + return cls(import_graph_fn) + + @classmethod + def fromSavedModel(cls, saved_model_dir, tag_set, signature_def_key=None): + """ + Construct a TFInputGraphBuilder from a SavedModel + """ + + def import_graph_fn(sess): + tag_sets = tag_set.split(',') + meta_graph_def = tf.saved_model.loader.load(sess, tag_sets, saved_model_dir) + + sig_def = None + if signature_def_key is not None: + sig_def = tf.contrib.saved_model.get_signature_def_by_key( + meta_graph_def, signature_def_key) + + return sig_def + + return cls(import_graph_fn) diff --git a/python/sparkdl/transformers/utils.py b/python/sparkdl/transformers/utils.py index 04ff3eff..64373e6a 100644 --- a/python/sparkdl/transformers/utils.py +++ b/python/sparkdl/transformers/utils.py @@ -15,7 +15,6 @@ import tensorflow as tf -import sparkdl.graph.utils as tfx from sparkdl.image.imageIO import imageType # image stuff @@ -34,147 +33,3 @@ class ImageNetConstants: class InceptionV3Constants: INPUT_SHAPE = (299, 299) NUM_OUTPUT_FEATURES = 131072 - - -class TFInputGraph(object): - """ - An opaque serializable object containing TensorFlow graph. - """ - # TODO: for (de-)serialization, the class should correspond to a ProtocolBuffer definition. - def __init__(self, graph_def, input_mapping, output_mapping): - # tf.GraphDef - self.graph_def = graph_def - - if isinstance(input_mapping, dict): - input_mapping = list(input_mapping.items()) - assert isinstance(input_mapping, list), \ - "output mapping must be a list of strings, found type {}".format(type(input_mapping)) - self.input_mapping = sorted(input_mapping) - - if isinstance(output_mapping, dict): - output_mapping = list(output_mapping.items()) - assert isinstance(output_mapping, list), \ - "output mapping must be a list of strings, found type {}".format(type(output_mapping)) - self.output_mapping = sorted(output_mapping) - - -class TFInputGraphBuilder(object): - """ - Create a builder function so as to be able to compile graph for inference. - The actual compilation will be done at the time when the - inputs (feeds) and outputs (fetches) are provided. - :param graph_import_fn: `tf.Session` -> `tf.signature_def`, load a graph to the provided session. - If the meta_graph contains a `signature_def`, return it. - """ - def __init__(self, graph_import_fn): - # Return signature_def if the underlying graph contains one - self.graph_import_fn = graph_import_fn - - def build(self, input_mapping, output_mapping): - """ - Create a serializable TensorFlow graph representation - :param input_mapping: dict, from input DataFrame column name to internal graph name. - """ - graph = tf.Graph() - with tf.Session(graph=graph) as sess: - sig_def = self.graph_import_fn(sess) - - # Append feeds and input mapping - _input_mapping = {} - for input_colname, tnsr_or_sig in input_mapping.items(): - if sig_def: - tnsr = sig_def.inputs[tnsr_or_sig].name - else: - tnsr = tnsr_or_sig - _input_mapping[input_colname] = tfx.op_name(graph, tnsr) - input_mapping = _input_mapping - - # Append fetches and output mapping - fetches = [] - _output_mapping = {} - # By default the output columns will have the name of their - # corresponding `tf.Graph` operation names. - # We have to convert them to the user specified output names - for tnsr_or_sig, requested_colname in output_mapping.items(): - if sig_def: - tnsr = sig_def.outputs[tnsr_or_sig].name - else: - tnsr = tnsr_or_sig - fetches.append(tfx.get_tensor(graph, tnsr)) - tf_output_colname = tfx.op_name(graph, tnsr) - _output_mapping[tf_output_colname] = requested_colname - output_mapping = _output_mapping - - gdef = tfx.strip_and_freeze_until(fetches, graph, sess) - - return TFInputGraph(gdef, input_mapping, output_mapping) - - @classmethod - def fromGraph(cls, graph): - """ - Construct a TFInputGraphBuilder from a in memory tf.Graph object - """ - assert isinstance(graph, tf.Graph), \ - ('expect tf.Graph type but got', type(graph)) - - def import_graph_fn(sess): - gdef = graph.as_graph_def(add_shapes=True) - with sess.as_default(): - tf.import_graph_def(gdef, name='') - return None # no meta_graph_def - - return cls(import_graph_fn) - - @classmethod - def fromGraphDef(cls, graph_def): - """ - Construct a TFInputGraphBuilder from a tf.GraphDef object - """ - assert isinstance(graph_def, tf.GraphDef), \ - ('expect tf.GraphDef type but got', type(graph_def)) - - def import_graph_fn(sess): - with sess.as_default(): - tf.import_graph_def(graph_def, name='') - return None - - return cls(import_graph_fn) - - @classmethod - def fromCheckpoint(cls, checkpoint_dir, signature_def_key=None): - """ - Construct a TFInputGraphBuilder from a model checkpoint - """ - def import_graph_fn(sess): - # Load checkpoint and import the graph - ckpt_path = tf.train.latest_checkpoint(checkpoint_dir) - saver = tf.train.import_meta_graph("{}.meta".format(ckpt_path), clear_devices=True) - saver.restore(sess, ckpt_path) - meta_graph_def = saver.export_meta_graph(clear_devices=True) - - sig_def = None - if signature_def_key is not None: - sig_def = tf.contrib.saved_model.get_signature_def_by_key( - meta_graph_def, signature_def_key) - - return sig_def - - return cls(import_graph_fn) - - @classmethod - def fromSavedModel(cls, saved_model_dir, tag_set, signature_def_key=None): - """ - Construct a TFInputGraphBuilder from a SavedModel - """ - def import_graph_fn(sess): - tag_sets = tag_set.split(',') - meta_graph_def = tf.saved_model.loader.load(sess, tag_sets, saved_model_dir) - - sig_def = None - if signature_def_key is not None: - sig_def = tf.contrib.saved_model.get_signature_def_by_key( - meta_graph_def, signature_def_key) - - return sig_def - - return cls(import_graph_fn) From 692b0ebeaacbcfaff22c3205e9a55df3256c5cac Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Mon, 11 Sep 2017 20:49:06 -0700 Subject: [PATCH 09/23] (WIP) address comments --- python/sparkdl/__init__.py | 4 +- python/sparkdl/graph/input.py | 57 +++++++++++++--- python/sparkdl/transformers/param.py | 56 +++++++++------ python/sparkdl/transformers/tf_tensor.py | 14 ++-- python/tests/transformers/tf_tensor_test.py | 75 ++++++++++++++++----- 5 files changed, 150 insertions(+), 56 deletions(-) diff --git a/python/sparkdl/__init__.py b/python/sparkdl/__init__.py index b1a921cb..c5c55ff3 100644 --- a/python/sparkdl/__init__.py +++ b/python/sparkdl/__init__.py @@ -13,12 +13,14 @@ # limitations under the License. # +from .graph.input import TFInputGraphBuilder from .image.imageIO import imageSchema, imageType, readImages from .transformers.keras_image import KerasImageFileTransformer from .transformers.named_image import DeepImagePredictor, DeepImageFeaturizer from .transformers.tf_image import TFImageTransformer from .transformers.tf_tensor import TFTransformer -from .transformers.utils import imageInputPlaceholder, TFInputGraphBuilder +from .transformers.utils import imageInputPlaceholder + __all__ = [ 'imageSchema', 'imageType', 'readImages', diff --git a/python/sparkdl/graph/input.py b/python/sparkdl/graph/input.py index ea0d7502..c495697d 100644 --- a/python/sparkdl/graph/input.py +++ b/python/sparkdl/graph/input.py @@ -14,13 +14,18 @@ # import tensorflow as tf +from tensorflow.core.protobuf import meta_graph_pb2 import sparkdl.graph.utils as tfx +__all__ = ["TFInputGraphBuilder", "get_params_from_checkpoint", "get_params_from_saved_model"] + class TFInputGraph(object): """ An opaque serializable object containing TensorFlow graph. + + [WARNING] This class should not be called by any user code. """ # TODO: for (de-)serialization, the class should correspond to a ProtocolBuffer definition. @@ -41,6 +46,28 @@ def __init__(self, graph_def, input_mapping, output_mapping): self.output_mapping = sorted(output_mapping) +def _get_params_from(gin_builder, input_mapping, output_mapping): + gin = gin_builder.build(input_mapping, output_mapping) + imap = dict(gin.input_mapping) + assert len(imap) == len(gin.input_mapping) + omap = dict(gin.output_mapping) + assert len(omap) == len(gin.output_mapping) + return gin.graph_def, imap, omap + + +def get_params_from_checkpoint(checkpoint_dir, signature_def_key, input_mapping, output_mapping): + assert signature_def_key is not None + gin_builder = TFInputGraphBuilder.fromCheckpoint(checkpoint_dir, signature_def_key) + return _get_params_from(gin_builder, input_mapping, output_mapping) + + +def get_params_from_saved_model(saved_model_dir, tag_set, signature_def_key, input_mapping, + output_mapping): + assert signature_def_key is not None + gin_builder = TFInputGraphBuilder.fromSavedModel(saved_model_dir, tag_set, signature_def_key) + return _get_params_from(gin_builder, input_mapping, output_mapping) + + class TFInputGraphBuilder(object): """ Create a builder function so as to be able to compile graph for inference. @@ -65,7 +92,9 @@ def build(self, input_mapping, output_mapping): # Append feeds and input mapping _input_mapping = {} - for input_colname, tnsr_or_sig in input_mapping.items(): + if isinstance(input_mapping, dict): + input_mapping = input_mapping.items() + for input_colname, tnsr_or_sig in input_mapping: if sig_def: tnsr = sig_def.inputs[tnsr_or_sig].name else: @@ -79,7 +108,9 @@ def build(self, input_mapping, output_mapping): # By default the output columns will have the name of their # corresponding `tf.Graph` operation names. # We have to convert them to the user specified output names - for tnsr_or_sig, requested_colname in output_mapping.items(): + if isinstance(output_mapping, dict): + output_mapping = output_mapping.items() + for tnsr_or_sig, requested_colname in output_mapping: if sig_def: tnsr = sig_def.outputs[tnsr_or_sig].name else: @@ -132,15 +163,21 @@ def fromCheckpoint(cls, checkpoint_dir, signature_def_key=None): def import_graph_fn(sess): # Load checkpoint and import the graph - ckpt_path = tf.train.latest_checkpoint(checkpoint_dir) - saver = tf.train.import_meta_graph("{}.meta".format(ckpt_path), clear_devices=True) - saver.restore(sess, ckpt_path) - meta_graph_def = saver.export_meta_graph(clear_devices=True) + with sess.as_default(): + ckpt_path = tf.train.latest_checkpoint(checkpoint_dir) - sig_def = None - if signature_def_key is not None: - sig_def = tf.contrib.saved_model.get_signature_def_by_key( - meta_graph_def, signature_def_key) + # NOTE(phi-dbq): we must manually load meta_graph_def to get the signature_def + meta_graph_def = meta_graph_pb2.MetaGraphDef() + with open("{}.meta".format(ckpt_path), 'rb') as fin: + meta_graph_def.ParseFromString(fin.read()) + + saver = tf.train.import_meta_graph(meta_graph_def, clear_devices=True) + saver.restore(sess, ckpt_path) + + sig_def = None + if signature_def_key is not None: + sig_def = meta_graph_def.signature_def[signature_def_key] + # TODO: check if sig_def is valid return sig_def diff --git a/python/sparkdl/transformers/param.py b/python/sparkdl/transformers/param.py index 827b7234..233de707 100644 --- a/python/sparkdl/transformers/param.py +++ b/python/sparkdl/transformers/param.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - """ Some parts are copied from pyspark.ml.param.shared and some are complementary to pyspark.ml.param. The copy is due to some useful pyspark fns/classes being @@ -29,12 +28,12 @@ from sparkdl.graph.builder import GraphFunction, IsolatedSession import sparkdl.graph.utils as tfx -from sparkdl.transformers.utils import TFInputGraph, TFInputGraphBuilder - +from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder """ Copied from PySpark for backward compatibility. First in Apache Spark version 2.1.1. """ + def keyword_only(func): """ A decorator that forces keyword arguments in the wrapped method @@ -42,12 +41,14 @@ def keyword_only(func): .. note:: Should only be used to wrap a method where first arg is `self` """ + @wraps(func) def wrapper(self, *args, **kwargs): if len(args) > 0: raise TypeError("Method %s forces keyword arguments." % func.__name__) self._input_kwargs = kwargs return func(self, **kwargs) + return wrapper @@ -56,8 +57,8 @@ class HasInputCol(Params): Mixin for param inputCol: input column name. """ - inputCol = Param(Params._dummy(), "inputCol", "input column name.", - typeConverter=TypeConverters.toString) + inputCol = Param( + Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString) def setInputCol(self, value): """ @@ -77,8 +78,8 @@ class HasOutputCol(Params): Mixin for param outputCol: output column name. """ - outputCol = Param(Params._dummy(), "outputCol", "output column name.", - typeConverter=TypeConverters.toString) + outputCol = Param( + Params._dummy(), "outputCol", "output column name.", typeConverter=TypeConverters.toString) def __init__(self): super(HasOutputCol, self).__init__() @@ -100,8 +101,9 @@ def getOutputCol(self): """ TensorFlow Specific Parameters """ -class SparkDLTypeConverters(object): + +class SparkDLTypeConverters(object): @staticmethod def toTFGraph(value): if isinstance(value, tf.Graph): @@ -165,17 +167,22 @@ def converter(value): return value else: raise TypeError("%s %s is not in the supported list." % type(value), str(value)) + return converter + # New in sparkdl + class HasOutputMapping(Params): """ - Mixin for param outputMapping: ordered list of ('outputTensorName', 'outputColName') pairs + Mixin for param outputMapping: ordered list of ('outputTensorOpName', 'outputColName') pairs """ - outputMapping = Param(Params._dummy(), "outputMapping", - "Mapping output :class:`tf.Tensor` objects to DataFrame column names", - typeConverter=SparkDLTypeConverters.asTensorToColumnMap) + outputMapping = Param( + Params._dummy(), + "outputMapping", + "Mapping output :class:`tf.Operation` names to DataFrame column names", + typeConverter=SparkDLTypeConverters.asTensorToColumnMap) def setOutputMapping(self, value): return self._set(outputMapping=value) @@ -186,11 +193,13 @@ def getOutputMapping(self): class HasInputMapping(Params): """ - Mixin for param inputMapping: ordered list of ('inputColName', 'inputTensorName') pairs + Mixin for param inputMapping: ordered list of ('inputColName', 'inputTensorOpName') pairs """ - inputMapping = Param(Params._dummy(), "inputMapping", - "Mapping input DataFrame column names to :class:`tf.Tensor` objects", - typeConverter=SparkDLTypeConverters.asColumnToTensorMap) + inputMapping = Param( + Params._dummy(), + "inputMapping", + "Mapping input DataFrame column names to :class:`tf.Operation` names", + typeConverter=SparkDLTypeConverters.asColumnToTensorMap) def setInputMapping(self, value): return self._set(inputMapping=value) @@ -203,9 +212,11 @@ class HasTFInputGraph(Params): """ Mixin for param tfGraph: the :class:`tf.Graph` object that represents a TensorFlow computation. """ - tfInputGraph = Param(Params._dummy(), "tfInputGraph", - "TensorFlow Graph object", - typeConverter=SparkDLTypeConverters.toTFInputGraph) + tfInputGraph = Param( + Params._dummy(), + "tfInputGraph", + "TensorFlow Graph object", + typeConverter=SparkDLTypeConverters.toTFInputGraph) def __init__(self): super(HasTFInputGraph, self).__init__() @@ -222,8 +233,11 @@ class HasTFHParams(Params): """ Mixin for TensorFlow model hyper-parameters """ - tfHParams = Param(Params._dummy(), "hparams", "instance of :class:`tf.contrib.training.HParams`", - typeConverter=SparkDLTypeConverters.toTFHParams) + tfHParams = Param( + Params._dummy(), + "hparams", + "instance of :class:`tf.contrib.training.HParams`", + typeConverter=SparkDLTypeConverters.toTFHParams) def setTFHParams(self, value): return self._set(tfHParam=value) diff --git a/python/sparkdl/transformers/tf_tensor.py b/python/sparkdl/transformers/tf_tensor.py index 255c264a..21c9524a 100644 --- a/python/sparkdl/transformers/tf_tensor.py +++ b/python/sparkdl/transformers/tf_tensor.py @@ -21,10 +21,9 @@ from pyspark.ml import Transformer import sparkdl.graph.utils as tfx -from sparkdl.transformers.utils import TFInputGraphBuilder -from sparkdl.transformers.param import ( - keyword_only, SparkDLTypeConverters, HasInputMapping, - HasOutputMapping, HasTFInputGraph, HasTFHParams) +from sparkdl.graph.input import TFInputGraphBuilder +from sparkdl.transformers.param import (keyword_only, SparkDLTypeConverters, HasInputMapping, + HasOutputMapping, HasTFInputGraph, HasTFHParams) __all__ = ['TFTransformer'] @@ -72,11 +71,8 @@ def _transform(self, dataset): with tf.Session(graph=graph): analyzed_df = tfs.analyze(dataset) - out_tnsr_op_names = [tfx.as_op_name(tnsr_op_name) - for tnsr_op_name, _ in output_mapping] - tf.import_graph_def(graph_def=gin.graph_def, - name='', - return_elements=out_tnsr_op_names) + out_tnsr_op_names = [tfx.as_op_name(tnsr_op_name) for tnsr_op_name, _ in output_mapping] + tf.import_graph_def(graph_def=gin.graph_def, name='', return_elements=out_tnsr_op_names) feed_dict = dict((tfx.op_name(graph, tnsr_op_name), col_name) for col_name, tnsr_op_name in input_mapping) diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index 85df23a7..88eeb273 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -14,6 +14,7 @@ # from __future__ import absolute_import, division, print_function +from glob import glob import os import shutil import tempfile @@ -26,8 +27,9 @@ from pyspark.sql.types import Row from sparkdl.graph.builder import IsolatedSession +from sparkdl.graph.input import * import sparkdl.graph.utils as tfx -from sparkdl.transformers.tf_tensor import TFTransformer, TFInputGraphBuilder +from sparkdl.transformers.tf_tensor import TFTransformer from ..tests import SparkDLTestCase @@ -133,15 +135,15 @@ def test_build_from_saved_model(self): # Build the transformer from exported serving model # We are using signaures, thus must provide the keys - trans_with_sig = TFTransformer( - tfInputGraph=TFInputGraphBuilder.fromSavedModel( - saved_model_dir, tag_set=serving_tag, signature_def_key=serving_sigdef_key), - inputMapping={ - input_col: 'input_sig' - }, - outputMapping={ - 'output_sig': output_col - }) + tfInputGraph, inputMapping, outputMapping = get_params_from_saved_model( + saved_model_dir, serving_tag, serving_sigdef_key, + input_mapping={ + input_col: 'input_sig'}, + output_mapping={ + 'output_sig': output_col}) + trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, + inputMapping=inputMapping, + outputMapping=outputMapping) # Build the transformer from exported serving model # We are not using signatures, thus must provide tensor/operation names @@ -186,7 +188,31 @@ def test_build_from_checkpoint(self): z = tf.reduce_mean(x * w, axis=1, name='tnsrOut') sess.run(w.initializer) saver = tf.train.Saver(var_list=[w]) - _ = saver.save(sess, ckpt_path_prefix, global_step=2702) + ckpt_path = saver.save(sess, ckpt_path_prefix, global_step=2702) + + # Prepare the signature_def + serving_sigdef = tf.saved_model.signature_def_utils.build_signature_def( + inputs={ + 'input_sig': tf.saved_model.utils.build_tensor_info(x) + }, + outputs={ + 'output_sig': tf.saved_model.utils.build_tensor_info(z) + }) + + # A rather contrived way to add signature def to a meta_graph + serving_sigdef_key = 'prediction_signature' + meta_graph_def = tf.train.export_meta_graph() + + # Find the meta_graph file (there should be only one) + _ckpt_meta_fpaths = glob('{}/*.meta'.format(model_ckpt_dir)) + self.assertEqual(len(_ckpt_meta_fpaths), 1, msg=','.join(_ckpt_meta_fpaths)) + ckpt_meta_fpath = _ckpt_meta_fpaths[0] + + # Add signature_def to the meta_graph and serialize it + # This will overwrite the existing meta_graph_def file + meta_graph_def.signature_def[serving_sigdef_key].CopyFrom(serving_sigdef) + with open(ckpt_meta_fpath, mode='wb') as fout: + fout.write(meta_graph_def.SerializeToString()) # Get the reference data _results = [] @@ -195,7 +221,24 @@ def test_build_from_checkpoint(self): _results.append(sess.run(z, {x: arr})) out_ref = np.hstack(_results) - transformer = TFTransformer( + test_results = [] + def _add_test(transformer, msg, trs=test_results): + final_df = transformer.transform(analyzed_df) + out_tgt = grab_df_arr(final_df, output_col) + trs.append((np.allclose(out_ref, out_tgt), msg)) + + tfInputGraph, inputMapping, outputMapping = get_params_from_checkpoint( + model_ckpt_dir, serving_sigdef_key, + input_mapping={ + input_col: 'input_sig'}, + output_mapping={ + 'output_sig': output_col}) + trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, + inputMapping=inputMapping, + outputMapping=outputMapping) + _add_test(trans_with_sig, 'transformer built with signature_def') + + trans_no_sig = TFTransformer( tfInputGraph=TFInputGraphBuilder.fromCheckpoint(model_ckpt_dir), inputMapping={ input_col: 'tnsrIn' @@ -203,11 +246,13 @@ def test_build_from_checkpoint(self): outputMapping={ 'tnsrOut': output_col }) - final_df = transformer.transform(analyzed_df) - out_tgt = grab_df_arr(final_df, output_col) + _add_test(trans_no_sig, 'transformer built WITHOUT signature_def') + # First delete the resource shutil.rmtree(model_ckpt_dir, ignore_errors=True) - self.assertTrue(np.allclose(out_ref, out_tgt)) + # Then check each test result + for test_result, test_msg in test_results: + self.assertTrue(test_result, msg=test_msg) def test_multi_io(self): From 66d44e99dda76781275a53f639f7e1ba924f0226 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Wed, 13 Sep 2017 10:57:56 -0700 Subject: [PATCH 10/23] (WIP) respond to PR comments --- python/sparkdl/graph/input.py | 44 +++++++----------- python/sparkdl/transformers/param.py | 51 ++++++++++----------- python/sparkdl/transformers/tf_tensor.py | 33 +++++++++++-- python/tests/transformers/tf_tensor_test.py | 5 +- 4 files changed, 70 insertions(+), 63 deletions(-) diff --git a/python/sparkdl/graph/input.py b/python/sparkdl/graph/input.py index c495697d..7590be13 100644 --- a/python/sparkdl/graph/input.py +++ b/python/sparkdl/graph/input.py @@ -29,43 +29,20 @@ class TFInputGraph(object): """ # TODO: for (de-)serialization, the class should correspond to a ProtocolBuffer definition. - def __init__(self, graph_def, input_mapping, output_mapping): + def __init__(self, graph_def): # tf.GraphDef self.graph_def = graph_def - if isinstance(input_mapping, dict): - input_mapping = list(input_mapping.items()) - assert isinstance(input_mapping, list), \ - "output mapping must be a list of strings, found type {}".format(type(input_mapping)) - self.input_mapping = sorted(input_mapping) - - if isinstance(output_mapping, dict): - output_mapping = list(output_mapping.items()) - assert isinstance(output_mapping, list), \ - "output mapping must be a list of strings, found type {}".format(type(output_mapping)) - self.output_mapping = sorted(output_mapping) - - -def _get_params_from(gin_builder, input_mapping, output_mapping): - gin = gin_builder.build(input_mapping, output_mapping) - imap = dict(gin.input_mapping) - assert len(imap) == len(gin.input_mapping) - omap = dict(gin.output_mapping) - assert len(omap) == len(gin.output_mapping) - return gin.graph_def, imap, omap - - def get_params_from_checkpoint(checkpoint_dir, signature_def_key, input_mapping, output_mapping): assert signature_def_key is not None gin_builder = TFInputGraphBuilder.fromCheckpoint(checkpoint_dir, signature_def_key) - return _get_params_from(gin_builder, input_mapping, output_mapping) - + return gin_builder.build(input_mapping, output_mapping) def get_params_from_saved_model(saved_model_dir, tag_set, signature_def_key, input_mapping, output_mapping): assert signature_def_key is not None gin_builder = TFInputGraphBuilder.fromSavedModel(saved_model_dir, tag_set, signature_def_key) - return _get_params_from(gin_builder, input_mapping, output_mapping) + return gin_builder.build(input_mapping, output_mapping) class TFInputGraphBuilder(object): @@ -117,12 +94,19 @@ def build(self, input_mapping, output_mapping): tnsr = tnsr_or_sig fetches.append(tfx.get_tensor(graph, tnsr)) tf_output_colname = tfx.op_name(graph, tnsr) + # NOTE(phi-dbq): put the check here as it will be the entry point to construct + # a `TFInputGraph` object. + assert tf_output_colname not in _output_mapping, \ + "operation {} has multiple output tensors and ".format(tf_output_colname) + \ + "at least two of them are used in the output DataFrame. " + \ + "Operation names are used to name columns which leads to conflicts. " + \ + "You can apply `tf.identity` ops to each to avoid name conflicts." _output_mapping[tf_output_colname] = requested_colname output_mapping = _output_mapping gdef = tfx.strip_and_freeze_until(fetches, graph, sess) - return TFInputGraph(gdef, input_mapping, output_mapping) + return TFInputGraph(gdef), input_mapping, output_mapping @classmethod def fromGraph(cls, graph): @@ -167,6 +151,8 @@ def import_graph_fn(sess): ckpt_path = tf.train.latest_checkpoint(checkpoint_dir) # NOTE(phi-dbq): we must manually load meta_graph_def to get the signature_def + # the current `import_graph_def` function seems to ignore + # any signature_def fields in a checkpoint's meta_graph_def. meta_graph_def = meta_graph_pb2.MetaGraphDef() with open("{}.meta".format(ckpt_path), 'rb') as fin: meta_graph_def.ParseFromString(fin.read()) @@ -177,7 +163,9 @@ def import_graph_fn(sess): sig_def = None if signature_def_key is not None: sig_def = meta_graph_def.signature_def[signature_def_key] - # TODO: check if sig_def is valid + assert sig_def, 'singnature_def_key {} provided, '.format(signature_def_key) + \ + 'but failed to find it from the meta_graph_def ' + \ + 'from checkpoint {}'.format(checkpoint_dir) return sig_def diff --git a/python/sparkdl/transformers/param.py b/python/sparkdl/transformers/param.py index 233de707..840a6d02 100644 --- a/python/sparkdl/transformers/param.py +++ b/python/sparkdl/transformers/param.py @@ -29,10 +29,10 @@ from sparkdl.graph.builder import GraphFunction, IsolatedSession import sparkdl.graph.utils as tfx from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder -""" -Copied from PySpark for backward compatibility. First in Apache Spark version 2.1.1. -""" +######################################################## +# Copied from PySpark for backward compatibility. First in Apache Spark version 2.1.1. +######################################################## def keyword_only(func): """ @@ -98,21 +98,15 @@ def getOutputCol(self): return self.getOrDefault(self.outputCol) -""" -TensorFlow Specific Parameters -""" - +######################################################## +# New in sparkdl: TensorFlow Specific Parameters +######################################################## class SparkDLTypeConverters(object): @staticmethod def toTFGraph(value): if isinstance(value, tf.Graph): return value - elif isinstance(value, GraphFunction): - with IsolatedSession() as issn: - issn.importGraphFunction(value, prefix='') - g = issn.graph - return g else: raise TypeError("Could not convert %s to TensorFlow Graph" % type(value)) @@ -120,12 +114,6 @@ def toTFGraph(value): def toTFInputGraph(value): if isinstance(value, TFInputGraph): return value - elif isinstance(value, TFInputGraphBuilder): - return value - elif isinstance(value, tf.Graph): - return TFInputGraphBuilder.fromGraph(value) - elif isinstance(value, tf.GraphDef): - return TFInputGraphBuilder.fromGraphDef(value) else: raise TypeError("Could not convert %s to TFInputGraph" % type(value)) @@ -171,9 +159,6 @@ def converter(value): return converter -# New in sparkdl - - class HasOutputMapping(Params): """ Mixin for param outputMapping: ordered list of ('outputTensorOpName', 'outputColName') pairs @@ -185,7 +170,11 @@ class HasOutputMapping(Params): typeConverter=SparkDLTypeConverters.asTensorToColumnMap) def setOutputMapping(self, value): - return self._set(outputMapping=value) + # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the + # serializable TFInputGraph object once the inputMapping and outputMapping + # parameters are provided. + raise NotImplementedError( + "Please use the Transformer's constructor to assign `outputMapping` field.") def getOutputMapping(self): return self.getOrDefault(self.outputMapping) @@ -202,7 +191,11 @@ class HasInputMapping(Params): typeConverter=SparkDLTypeConverters.asColumnToTensorMap) def setInputMapping(self, value): - return self._set(inputMapping=value) + # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the + # serializable TFInputGraph object once the inputMapping and outputMapping + # parameters are provided. + raise NotImplementedError( + "Please use the Transformer's constructor to assigne `inputMapping` field.") def getInputMapping(self): return self.getOrDefault(self.inputMapping) @@ -210,12 +203,12 @@ def getInputMapping(self): class HasTFInputGraph(Params): """ - Mixin for param tfGraph: the :class:`tf.Graph` object that represents a TensorFlow computation. + Mixin for param tfInputGraph: a serializable object derived from a TensorFlow computation graph. """ tfInputGraph = Param( Params._dummy(), "tfInputGraph", - "TensorFlow Graph object", + "A serializable object derived from a TensorFlow computation graph", typeConverter=SparkDLTypeConverters.toTFInputGraph) def __init__(self): @@ -223,7 +216,11 @@ def __init__(self): self._setDefault(tfInputGraph=None) def setTFInputGraph(self, value): - return self._set(tfInputGraph=value) + # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the + # serializable TFInputGraph object once the inputMapping and outputMapping + # parameters are provided. + raise NotImplementedError( + "Please use the Transformer's constructor to assign `tfInputGraph` field.") def getTFInputGraph(self): return self.getOrDefault(self.tfInputGraph) @@ -236,7 +233,7 @@ class HasTFHParams(Params): tfHParams = Param( Params._dummy(), "hparams", - "instance of :class:`tf.contrib.training.HParams`", + "instance of :class:`tf.contrib.training.HParams`, a key-value map-like object", typeConverter=SparkDLTypeConverters.toTFHParams) def setTFHParams(self, value): diff --git a/python/sparkdl/transformers/tf_tensor.py b/python/sparkdl/transformers/tf_tensor.py index 21c9524a..2b091979 100644 --- a/python/sparkdl/transformers/tf_tensor.py +++ b/python/sparkdl/transformers/tf_tensor.py @@ -21,7 +21,7 @@ from pyspark.ml import Transformer import sparkdl.graph.utils as tfx -from sparkdl.graph.input import TFInputGraphBuilder +from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder from sparkdl.transformers.param import (keyword_only, SparkDLTypeConverters, HasInputMapping, HasOutputMapping, HasTFInputGraph, HasTFHParams) @@ -57,15 +57,38 @@ def setParams(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tf """ super(TFTransformer, self).__init__() kwargs = self._input_kwargs - _maybe_gin = SparkDLTypeConverters.toTFInputGraph(tfInputGraph) + # The set of parameters either come from some helper functions, + # in which case type(_maybe_gin) is already TFInputGraph. + _maybe_gin = tfInputGraph + if isinstance(_maybe_gin, TFInputGraph): + return self._set(**kwargs) + + # Otherwise, `_maybe_gin` needs to be converted to TFInputGraph + # We put all the conversion logic here rather than in SparkDLTypeConverters if isinstance(_maybe_gin, TFInputGraphBuilder): - kwargs['tfInputGraph'] = _maybe_gin.build(inputMapping, outputMapping) + gin = _maybe_gin + elif isinstance(_maybe_gin, tf.Graph): + gin = TFInputGraphBuilder.fromGraph(_maybe_gin) + elif isinstance(_maybe_gin, tf.GraphDef): + gin = TFInputGraphBuilder.fromGraphDef(_maybe_gin) + else: + raise TypeError("TFTransformer expect tfInputGraph convertible to TFInputGraph, " + \ + "but the given type {} cannot be converted, ".format(type(tfInputGraph)) + \ + "please provide `tf.Graph`, `tf.GraphDef` or use one of the " + \ + "`get_params_from_*` helper functions to build parameters") + + gin, input_mapping, output_mapping = gin.build(inputMapping, outputMapping) + kwargs['tfInputGraph'] = gin + kwargs['inputMapping'] = input_mapping + kwargs['outputMapping'] = output_mapping + + # Further conanonicalization, e.g. converting dict to sorted str pairs happens here return self._set(**kwargs) def _transform(self, dataset): gin = self.getTFInputGraph() - input_mapping = gin.input_mapping - output_mapping = gin.output_mapping + input_mapping = self.getInputMapping() + output_mapping = self.getOutputMapping() graph = tf.Graph() with tf.Session(graph=graph): diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index 88eeb273..221c5f43 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -54,7 +54,6 @@ def test_build_from_tf_graph(self): # Build the TensorFlow graph with tf.Session() as sess: - #x = tf.placeholder(tf.float64, shape=[None, vec_size]) x = tfs.block(analyzed_df, 'vec') z = tf.reduce_mean(x, axis=1) graph = sess.graph @@ -68,9 +67,9 @@ def test_build_from_tf_graph(self): # Apply the transform gin_from_graph = TFInputGraphBuilder.fromGraph(graph) - for gin in [gin_from_graph, graph]: + for gin_or_graph in [gin_from_graph, graph]: transfomer = TFTransformer( - tfInputGraph=TFInputGraphBuilder.fromGraph(graph), + tfInputGraph=gin_or_graph, inputMapping={ 'vec': x }, From 9b3fe86e759bcf7a1a86595a3fa86ece60ca301e Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Wed, 13 Sep 2017 15:56:50 -0700 Subject: [PATCH 11/23] test refactor --- python/sparkdl/transformers/param.py | 5 +- python/tests/transformers/tf_tensor_test.py | 409 +++++++++----------- 2 files changed, 181 insertions(+), 233 deletions(-) diff --git a/python/sparkdl/transformers/param.py b/python/sparkdl/transformers/param.py index 840a6d02..a8c7a891 100644 --- a/python/sparkdl/transformers/param.py +++ b/python/sparkdl/transformers/param.py @@ -31,7 +31,8 @@ from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder ######################################################## -# Copied from PySpark for backward compatibility. First in Apache Spark version 2.1.1. +# Copied from PySpark for backward compatibility. +# They first appeared in Apache Spark version 2.1.1. ######################################################## def keyword_only(func): @@ -99,7 +100,7 @@ def getOutputCol(self): ######################################################## -# New in sparkdl: TensorFlow Specific Parameters +# New in sparkdl ######################################################## class SparkDLTypeConverters(object): diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index 221c5f43..d6fe9296 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -14,6 +14,7 @@ # from __future__ import absolute_import, division, print_function +from contextlib import contextmanager from glob import glob import os import shutil @@ -33,77 +34,142 @@ from ..tests import SparkDLTestCase -def grab_df_arr(df, output_col): - """ Stack the numpy array from a DataFrame column """ - return np.array([row.asDict()[output_col] - for row in df.select(output_col).collect()]) class TFTransformerTest(SparkDLTestCase): - def _get_rand_vec_df(self, num_rows, vec_size): - return self.session.createDataFrame( - Row(idx=idx, vec=np.random.randn(vec_size).tolist()) - for idx in range(num_rows)) - - def test_build_from_tf_graph(self): - # Build a simple input DataFrame - vec_size = 17 - num_vecs = 31 - df = self._get_rand_vec_df(num_vecs, vec_size) + def setUp(self): + self.vec_size = 17 + self.num_vecs = 31 + + self.input_col = 'vec' + self.input_op_name = 'tnsrOpIn' + self.output_col = 'outputCol' + self.output_op_name = 'tnsrOpOut' + + self.input_mapping = {} + self.output_mapping = {} + + self.transformers = [] + self.test_case_results = [] + # Build a temporary directory, which might or might not be used by the test + self.model_output_root = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.model_output_root, ignore_errors=True) + + def build_standard_transformers(self, sess, gin_builder_convertible): + def _add_transformer(imap, omap): + trnsfmr = TFTransformer( + tfInputGraph=gin_builder_convertible, inputMapping=imap, outputMapping=omap) + self.transformers.append(trnsfmr) + + _add_transformer(self.input_mapping, self.output_mapping) + + imap = [(col, tfx.get_tensor(sess.graph, op_name)) + for col, op_name in self.input_mapping.items()] + omap = [(tfx.get_tensor(sess.graph, op_name), col) + for op_name, col in self.output_mapping.items()] + _add_transformer(imap, omap) + + @contextmanager + def run_test_in_tf_session(self, replica=1): + """ [THIS IS NOT A TEST]: encapsulate general test workflow """ + + if replica > 1: + for i in range(replica): + colname = '{}_replica{:03d}'.format(self.input_col, i) + tnsr_op_name = '{}_replica{:03d}'.format(self.input_op_name, i) + self.input_mapping[colname] = tnsr_op_name + + colname = '{}_replica{:03d}'.format(self.output_col, i) + tnsr_op_name = '{}_replica{:03d}'.format(self.output_op_name, i) + self.output_mapping[tnsr_op_name] = colname + else: + self.input_mapping = {self.input_col: self.input_op_name} + self.output_mapping = {self.output_op_name: self.output_col} + + # Build local features and DataFrame from it + local_features = [] + for idx in range(self.num_vecs): + _dict = {'idx': idx} + for colname, _ in self.input_mapping.items(): + _dict[colname] = np.random.randn(self.vec_size).tolist() + + local_features.append(Row(**_dict)) + + df = self.session.createDataFrame(local_features) analyzed_df = tfs.analyze(df) # Build the TensorFlow graph - with tf.Session() as sess: - x = tfs.block(analyzed_df, 'vec') - z = tf.reduce_mean(x, axis=1) - graph = sess.graph + graph = tf.Graph() + with tf.Session(graph=graph) as sess: + # Build test graph and transformers from here + yield sess # Get the reference data _results = [] - for row in df.collect(): - arr = np.array(row.vec)[np.newaxis, :] - _results.append(sess.run(z, {x: arr})) + for row in local_features: + fetches = [tfx.get_tensor(graph, tnsr_op_name) + for tnsr_op_name in self.output_mapping.keys()] + feed_dict = {} + for colname, tnsr_op_name in self.input_mapping.items(): + tnsr = tfx.get_tensor(graph, tnsr_op_name) + feed_dict[tnsr] = np.array(row[colname])[np.newaxis, :] + + curr_res = sess.run(fetches, feed_dict=feed_dict) + _results.append(np.ravel(curr_res)) + out_ref = np.hstack(_results) # Apply the transform - gin_from_graph = TFInputGraphBuilder.fromGraph(graph) - for gin_or_graph in [gin_from_graph, graph]: - transfomer = TFTransformer( - tfInputGraph=gin_or_graph, - inputMapping={ - 'vec': x - }, - outputMapping={ - z: 'outCol' - }) - final_df = transfomer.transform(analyzed_df) - out_tgt = grab_df_arr(final_df, 'outCol') - self.assertTrue(np.allclose(out_ref, out_tgt)) + for transfomer in self.transformers: + out_df = transfomer.transform(analyzed_df) + out_colnames = [] + for old_colname, new_colname in self.output_mapping.items(): + out_colnames.append(new_colname) + if old_colname != new_colname: + out_df = out_df.withColumnRenamed(old_colname, new_colname) + _results = [] + for row in out_df.select(out_colnames).collect(): + curr_res = [row[colname] for colname in out_colnames] + _results.append(np.ravel(curr_res)) + out_tgt = np.hstack(_results) + + self.assertTrue(np.allclose(out_ref, out_tgt), + msg=repr(transfomer)) + + + def test_build_from_tf_graph(self): + with self.run_test_in_tf_session() as sess: + # Begin building graph + x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) + _ = tf.reduce_mean(x, axis=1, name=self.output_op_name) + # End building graph + + # Begin building transformers + self.build_standard_transformers(sess, sess.graph) + self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraph(sess.graph)) + gdef = sess.graph.as_graph_def() + self.build_standard_transformers(sess, gdef) + self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraphDef(gdef)) + # End building transformers - def test_build_from_saved_model(self): - # Setup dataset - vec_size = 17 - num_vecs = 31 - df = self._get_rand_vec_df(num_vecs, vec_size) - analyzed_df = tfs.analyze(df) - input_col = 'vec' - output_col = 'outputCol' + def test_build_from_saved_model(self): # Setup saved model export directory - saved_model_root = tempfile.mkdtemp() + saved_model_root = self.model_output_root saved_model_dir = os.path.join(saved_model_root, 'saved_model') serving_tag = "serving_tag" serving_sigdef_key = 'prediction_signature' - builder = tf.saved_model.builder.SavedModelBuilder(saved_model_dir) - with tf.Session(graph=tf.Graph()) as sess: + + with self.run_test_in_tf_session() as sess: # Model definition: begin - x = tf.placeholder(tf.float64, shape=[None, vec_size], name='tnsrIn') - #x = tf.placeholder(tf.float64, shape=[None, vec_size], name=input_col) - w = tf.Variable(tf.random_normal([vec_size], dtype=tf.float64), + x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) + w = tf.Variable(tf.random_normal([self.vec_size], dtype=tf.float64), dtype=tf.float64, name='varW') - z = tf.reduce_mean(x * w, axis=1, name='tnsrOut') + z = tf.reduce_mean(x * w, axis=1, name=self.output_op_name) # Model definition ends sess.run(w.initializer) @@ -120,74 +186,48 @@ def test_build_from_saved_model(self): builder.add_meta_graph_and_variables(sess, [serving_tag], signature_def_map={ - serving_sigdef_key: serving_sigdef - }) - # Get the reference data - _results = [] - for row in df.collect(): - arr = np.array(row.vec)[np.newaxis, :] - _results.append(sess.run(z, {x: arr})) - out_ref = np.hstack(_results) - - # Save the model - builder.save() - - # Build the transformer from exported serving model - # We are using signaures, thus must provide the keys - tfInputGraph, inputMapping, outputMapping = get_params_from_saved_model( - saved_model_dir, serving_tag, serving_sigdef_key, - input_mapping={ - input_col: 'input_sig'}, - output_mapping={ - 'output_sig': output_col}) - trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, - inputMapping=inputMapping, - outputMapping=outputMapping) - - # Build the transformer from exported serving model - # We are not using signatures, thus must provide tensor/operation names - trans_no_sig = TFTransformer( - tfInputGraph=TFInputGraphBuilder.fromSavedModel( - saved_model_dir, tag_set=serving_tag, signature_def_key=None), - inputMapping={ - input_col: 'tnsrIn' - }, - outputMapping={ - 'tnsrOut': output_col - }) - - df_trans_with_sig = trans_with_sig.transform(analyzed_df) - df_trans_no_sig = trans_no_sig.transform(analyzed_df) - out_with_sig_tgt = grab_df_arr(df_trans_with_sig, output_col) - out_no_sig_tgt = grab_df_arr(df_trans_no_sig, output_col) - # Cleanup the resources - shutil.rmtree(saved_model_root, ignore_errors=True) - self.assertTrue(np.allclose(out_ref, out_with_sig_tgt)) - self.assertTrue(np.allclose(out_ref, out_no_sig_tgt)) + serving_sigdef_key: serving_sigdef}) + builder.save() + + # Build the transformer from exported serving model + # We are using signaures, thus must provide the keys + tfInputGraph, inputMapping, outputMapping = get_params_from_saved_model( + saved_model_dir, serving_tag, serving_sigdef_key, + input_mapping={ + self.input_col: 'input_sig'}, + output_mapping={ + 'output_sig': self.output_col}) + trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, + inputMapping=inputMapping, + outputMapping=outputMapping) + self.transformers.append(trans_with_sig) + + # Build the transformer from exported serving model + # We are not using signatures, thus must provide tensor/operation names + gin_builder = TFInputGraphBuilder.fromSavedModel( + saved_model_dir, tag_set=serving_tag, signature_def_key=None) + self.build_standard_transformers(sess, gin_builder) def test_build_from_checkpoint(self): - vec_size = 17 - num_vecs = 31 - df = self._get_rand_vec_df(num_vecs, vec_size) - analyzed_df = tfs.analyze(df) - input_col = 'vec' - output_col = 'outputCol' - + """ + Test constructing a Transformer from a TensorFlow training checkpoint + """ # Build the TensorFlow graph - model_ckpt_dir = tempfile.mkdtemp() + model_ckpt_dir = self.model_output_root ckpt_path_prefix = os.path.join(model_ckpt_dir, 'model_ckpt') + serving_sigdef_key = 'prediction_signature' # Warning: please use a new graph for each test cases # or the tests could affect one another - with tf.Session(graph=tf.Graph()) as sess: - x = tf.placeholder(tf.float64, shape=[None, vec_size], name='tnsrIn') + with self.run_test_in_tf_session() as sess: + x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) #x = tf.placeholder(tf.float64, shape=[None, vec_size], name=input_col) - w = tf.Variable(tf.random_normal([vec_size], dtype=tf.float64), + w = tf.Variable(tf.random_normal([self.vec_size], dtype=tf.float64), dtype=tf.float64, name='varW') - z = tf.reduce_mean(x * w, axis=1, name='tnsrOut') + z = tf.reduce_mean(x * w, axis=1, name=self.output_op_name) sess.run(w.initializer) saver = tf.train.Saver(var_list=[w]) - ckpt_path = saver.save(sess, ckpt_path_prefix, global_step=2702) + _ = saver.save(sess, ckpt_path_prefix, global_step=2702) # Prepare the signature_def serving_sigdef = tf.saved_model.signature_def_utils.build_signature_def( @@ -199,7 +239,6 @@ def test_build_from_checkpoint(self): }) # A rather contrived way to add signature def to a meta_graph - serving_sigdef_key = 'prediction_signature' meta_graph_def = tf.train.export_meta_graph() # Find the meta_graph file (there should be only one) @@ -213,109 +252,42 @@ def test_build_from_checkpoint(self): with open(ckpt_meta_fpath, mode='wb') as fout: fout.write(meta_graph_def.SerializeToString()) - # Get the reference data - _results = [] - for row in df.collect(): - arr = np.array(row.vec)[np.newaxis, :] - _results.append(sess.run(z, {x: arr})) - out_ref = np.hstack(_results) + tfInputGraph, inputMapping, outputMapping = get_params_from_checkpoint( + model_ckpt_dir, serving_sigdef_key, + input_mapping={ + self.input_col: 'input_sig'}, + output_mapping={ + 'output_sig': self.output_col}) + trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, + inputMapping=inputMapping, + outputMapping=outputMapping) + self.transformers.append(trans_with_sig) - test_results = [] - def _add_test(transformer, msg, trs=test_results): - final_df = transformer.transform(analyzed_df) - out_tgt = grab_df_arr(final_df, output_col) - trs.append((np.allclose(out_ref, out_tgt), msg)) - - tfInputGraph, inputMapping, outputMapping = get_params_from_checkpoint( - model_ckpt_dir, serving_sigdef_key, - input_mapping={ - input_col: 'input_sig'}, - output_mapping={ - 'output_sig': output_col}) - trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, - inputMapping=inputMapping, - outputMapping=outputMapping) - _add_test(trans_with_sig, 'transformer built with signature_def') - - trans_no_sig = TFTransformer( - tfInputGraph=TFInputGraphBuilder.fromCheckpoint(model_ckpt_dir), - inputMapping={ - input_col: 'tnsrIn' - }, - outputMapping={ - 'tnsrOut': output_col - }) - _add_test(trans_no_sig, 'transformer built WITHOUT signature_def') - - # First delete the resource - shutil.rmtree(model_ckpt_dir, ignore_errors=True) - # Then check each test result - for test_result, test_msg in test_results: - self.assertTrue(test_result, msg=test_msg) + gin_builder = TFInputGraphBuilder.fromCheckpoint(model_ckpt_dir) + self.build_standard_transformers(sess, gin_builder) def test_multi_io(self): - # Build a simple input DataFrame - vec_size = 17 - num_vecs = 31 - _df = self._get_rand_vec_df(num_vecs, vec_size) - df_x = _df.withColumnRenamed('vec', 'vec_x') - _df = self._get_rand_vec_df(num_vecs, vec_size) - df_y = _df.withColumnRenamed('vec', 'vec_y') - df = df_x.join(df_y, on='idx', how='inner') - analyzed_df = tfs.analyze(df) - # Build the TensorFlow graph - with tf.Session() as sess: - x = tfs.block(analyzed_df, 'vec_x') - y = tfs.block(analyzed_df, 'vec_y') - p = tf.reduce_mean(x + y, axis=1) - q = tf.reduce_mean(x - y, axis=1) - graph = sess.graph + with self.run_test_in_tf_session(replica=2) as sess: + xs = [] + for tnsr_op_name in self.input_mapping.values(): + x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=tnsr_op_name) + xs.append(x) - # Get the reference data - p_out_ref = [] - q_out_ref = [] - for row in df.collect(): - arr_x = np.array(row['vec_x'])[np.newaxis, :] - arr_y = np.array(row['vec_y'])[np.newaxis, :] - p_val, q_val = sess.run([p, q], {x: arr_x, y: arr_y}) - p_out_ref.append(p_val) - q_out_ref.append(q_val) - p_out_ref = np.hstack(p_out_ref) - q_out_ref = np.hstack(q_out_ref) + zs = [] + for i, tnsr_op_name in enumerate(self.output_mapping.keys()): + z = tf.reduce_mean(xs[i], axis=1, name=tnsr_op_name) + zs.append(z) - # Apply the transform - transfomer = TFTransformer( - tfInputGraph=TFInputGraphBuilder.fromGraph(graph), - inputMapping={ - 'vec_x': x, - 'vec_y': y - }, - outputMapping={ - p: 'outcol_p', - q: 'outcol_q' - }) - final_df = transfomer.transform(analyzed_df) - p_out_tgt = grab_df_arr(final_df, 'outcol_p') - q_out_tgt = grab_df_arr(final_df, 'outcol_q') - - self.assertTrue(np.allclose(p_out_ref, p_out_tgt)) - self.assertTrue(np.allclose(q_out_ref, q_out_tgt)) + self.build_standard_transformers(sess, sess.graph) + self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraph(sess.graph)) def test_mixed_keras_graph(self): - - vec_size = 17 - num_vecs = 137 - df = self._get_rand_vec_df(num_vecs, vec_size) - analyzed_df = tfs.analyze(df) - - input_col = 'vec' - output_col = 'outCol' - # Build the graph: the output should have the same leading/batch dimension with IsolatedSession(using_keras=True) as issn: - tnsr_in = tfs.block(analyzed_df, input_col) + tnsr_in = tf.placeholder( + tf.double, shape=[None, self.vec_size], name=self.input_op_name) inp = tf.expand_dims(tnsr_in, axis=2) # Keras layers does not take tf.double inp = tf.cast(inp, tf.float32) @@ -325,7 +297,7 @@ def test_mixed_keras_graph(self): dense = Dense(1)(flat) # We must keep the leading dimension of the output redsum = tf.reduce_sum(dense, axis=1) - tnsr_out = tf.cast(redsum, tf.double, name='TnsrOut') + tnsr_out = tf.cast(redsum, tf.double, name=self.output_op_name) # Initialize the variables init_op = tf.global_variables_initializer() @@ -333,35 +305,10 @@ def test_mixed_keras_graph(self): # We could train the model ... but skip it here gfn = issn.asGraphFunction([tnsr_in], [tnsr_out]) - with IsolatedSession() as issn: - # Import the graph function object - feeds, fetches = issn.importGraphFunction(gfn, prefix='') - - # Rename the input column name to the feed op's name - orig_in_name = tfx.op_name(issn.graph, feeds[0]) - input_df = analyzed_df.withColumnRenamed(input_col, orig_in_name) - - # Do the actual computation - output_df = tfs.map_blocks(fetches, input_df) - - # Rename the output column (by default, the name of the fetch op's name) - orig_out_name = tfx.op_name(issn.graph, fetches[0]) - final_df = output_df.withColumnRenamed(orig_out_name, output_col) - - arr_ref = grab_df_arr(final_df, output_col) - - # Using the Transformer - gin_from_gdef = TFInputGraphBuilder.fromGraphDef(gfn.graph_def) - for gin in [gin_from_gdef, gfn.graph_def]: - transformer = TFTransformer( - tfInputGraph=gin, - inputMapping={ - input_col: gfn.input_names[0] - }, - outputMapping={ - gfn.output_names[0]: output_col - }) + with self.run_test_in_tf_session() as sess: + tf.import_graph_def(gfn.graph_def, name='') - transformed_df = transformer.transform(analyzed_df) - arr_tgt = grab_df_arr(transformed_df, output_col) - self.assertTrue(np.allclose(arr_ref, arr_tgt)) + self.build_standard_transformers(sess, sess.graph) + self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraph(sess.graph)) + self.build_standard_transformers(sess, gfn.graph_def) + self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraphDef(gfn.graph_def)) From dbd9aaad4860d4185e67088cb7b34be9d66042fd Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 15 Sep 2017 20:05:39 -0700 Subject: [PATCH 12/23] (wip) consolidating params --- python/sparkdl/param/converters.py | 95 ++++++++++ python/sparkdl/param/shared_params.py | 158 +++++++++++------ python/sparkdl/transformers/param.py | 244 -------------------------- 3 files changed, 197 insertions(+), 300 deletions(-) create mode 100644 python/sparkdl/param/converters.py delete mode 100644 python/sparkdl/transformers/param.py diff --git a/python/sparkdl/param/converters.py b/python/sparkdl/param/converters.py new file mode 100644 index 00000000..23fdd9dd --- /dev/null +++ b/python/sparkdl/param/converters.py @@ -0,0 +1,95 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from functools import wraps +import six + +import keras +import tensorflow as tf + +from sparkdl.graph.builder import GraphFunction, IsolatedSession +import sparkdl.graph.utils as tfx +from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder +import sparkdl.utils.keras_model as kmutil + +class SparkDLTypeConverters(object): + @staticmethod + def toTFGraph(value): + if isinstance(value, tf.Graph): + return value + else: + raise TypeError("Could not convert %s to TensorFlow Graph" % type(value)) + + @staticmethod + def toTFInputGraph(value): + if isinstance(value, TFInputGraph): + return value + else: + raise TypeError("Could not convert %s to TFInputGraph" % type(value)) + + @staticmethod + def asColumnToTensorMap(value): + if isinstance(value, dict): + strs_pair_seq = [(k, tfx.as_op_name(v)) for k, v in value.items()] + return sorted(strs_pair_seq) + raise TypeError("Could not convert %s to TensorFlow Tensor" % type(value)) + + @staticmethod + def asTensorToColumnMap(value): + if isinstance(value, dict): + strs_pair_seq = [(tfx.as_op_name(k), v) for k, v in value.items()] + return sorted(strs_pair_seq) + raise TypeError("Could not convert %s to TensorFlow Tensor" % type(value)) + + @staticmethod + def toTFHParams(value): + if isinstance(value, tf.contrib.training.HParams): + return value + else: + raise TypeError("Could not convert %s to TensorFlow HParams" % type(value)) + + @staticmethod + def toStringOrTFTensor(value): + if isinstance(value, tf.Tensor): + return value + else: + try: + return TypeConverters.toString(value) + except TypeError: + raise TypeError("Could not convert %s to tensorflow.Tensor or str" % type(value)) + + @staticmethod + def supportedNameConverter(supportedList): + def converter(value): + if value in supportedList: + return value + else: + raise TypeError("%s %s is not in the supported list." % type(value), str(value)) + + return converter + + @staticmethod + def toKerasLoss(value): + if kmutil.is_valid_loss_function(value): + return value + raise ValueError( + "Named loss not supported in Keras: {} type({})".format(value, type(value))) + + @staticmethod + def toKerasOptimizer(value): + if kmutil.is_valid_optimizer(value): + return value + raise TypeError( + "Named optimizer not supported in Keras: {} type({})".format(value, type(value))) diff --git a/python/sparkdl/param/shared_params.py b/python/sparkdl/param/shared_params.py index e169e891..8e1b9741 100644 --- a/python/sparkdl/param/shared_params.py +++ b/python/sparkdl/param/shared_params.py @@ -20,14 +20,21 @@ """ from functools import wraps +import six +import keras import tensorflow as tf -from pyspark.ml.param import Param, Params, TypeConverters - +from sparkdl.graph.builder import GraphFunction, IsolatedSession +import sparkdl.graph.utils as tfx +from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder import sparkdl.utils.keras_model as kmutil -# From pyspark + +######################################################## +# Copied from PySpark for backward compatibility. +# They first appeared in Apache Spark version 2.1.1. +######################################################## def keyword_only(func): """ @@ -36,12 +43,14 @@ def keyword_only(func): .. note:: Should only be used to wrap a method where first arg is `self` """ + @wraps(func) def wrapper(self, *args, **kwargs): if len(args) > 0: raise TypeError("Method %s forces keyword arguments." % func.__name__) self._input_kwargs = kwargs return func(self, **kwargs) + return wrapper @@ -50,10 +59,8 @@ class HasInputCol(Params): Mixin for param inputCol: input column name. """ - inputCol = Param(Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasInputCol, self).__init__() + inputCol = Param( + Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString) def setInputCol(self, value): """ @@ -73,8 +80,8 @@ class HasOutputCol(Params): Mixin for param outputCol: output column name. """ - outputCol = Param(Params._dummy(), - "outputCol", "output column name.", typeConverter=TypeConverters.toString) + outputCol = Param( + Params._dummy(), "outputCol", "output column name.", typeConverter=TypeConverters.toString) def __init__(self): super(HasOutputCol, self).__init__() @@ -92,54 +99,9 @@ def getOutputCol(self): """ return self.getOrDefault(self.outputCol) -############################################ +######################################################## # New in sparkdl -############################################ - -class SparkDLTypeConverters(object): - - @staticmethod - def toStringOrTFTensor(value): - if isinstance(value, tf.Tensor): - return value - else: - try: - return TypeConverters.toString(value) - except TypeError: - raise TypeError("Could not convert %s to tensorflow.Tensor or str" % type(value)) - - @staticmethod - def toTFGraph(value): - # TODO: we may want to support tf.GraphDef in the future instead of tf.Graph since user - # is less likely to mess up using GraphDef vs Graph (e.g. constants vs variables). - if isinstance(value, tf.Graph): - return value - else: - raise TypeError("Could not convert %s to tensorflow.Graph type" % type(value)) - - @staticmethod - def supportedNameConverter(supportedList): - def converter(value): - if value in supportedList: - return value - else: - raise TypeError("%s %s is not in the supported list." % type(value), str(value)) - - return converter - - @staticmethod - def toKerasLoss(value): - if kmutil.is_valid_loss_function(value): - return value - raise ValueError( - "Named loss not supported in Keras: {} type({})".format(value, type(value))) - - @staticmethod - def toKerasOptimizer(value): - if kmutil.is_valid_optimizer(value): - return value - raise TypeError( - "Named optimizer not supported in Keras: {} type({})".format(value, type(value))) +######################################################## class HasOutputNodeName(Params): @@ -233,3 +195,87 @@ def seKerasLoss(self, value): def getKerasLoss(self): return self.getOrDefault(self.kerasLoss) + + +class HasOutputMapping(Params): + """ + Mixin for param outputMapping: ordered list of ('outputTensorOpName', 'outputColName') pairs + """ + outputMapping = Param( + Params._dummy(), + "outputMapping", + "Mapping output :class:`tf.Operation` names to DataFrame column names", + typeConverter=SparkDLTypeConverters.asTensorToColumnMap) + + def setOutputMapping(self, value): + # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the + # serializable TFInputGraph object once the inputMapping and outputMapping + # parameters are provided. + raise NotImplementedError( + "Please use the Transformer's constructor to assign `outputMapping` field.") + + def getOutputMapping(self): + return self.getOrDefault(self.outputMapping) + + +class HasInputMapping(Params): + """ + Mixin for param inputMapping: ordered list of ('inputColName', 'inputTensorOpName') pairs + """ + inputMapping = Param( + Params._dummy(), + "inputMapping", + "Mapping input DataFrame column names to :class:`tf.Operation` names", + typeConverter=SparkDLTypeConverters.asColumnToTensorMap) + + def setInputMapping(self, value): + # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the + # serializable TFInputGraph object once the inputMapping and outputMapping + # parameters are provided. + raise NotImplementedError( + "Please use the Transformer's constructor to assigne `inputMapping` field.") + + def getInputMapping(self): + return self.getOrDefault(self.inputMapping) + + +class HasTFInputGraph(Params): + """ + Mixin for param tfInputGraph: a serializable object derived from a TensorFlow computation graph. + """ + tfInputGraph = Param( + Params._dummy(), + "tfInputGraph", + "A serializable object derived from a TensorFlow computation graph", + typeConverter=SparkDLTypeConverters.toTFInputGraph) + + def __init__(self): + super(HasTFInputGraph, self).__init__() + self._setDefault(tfInputGraph=None) + + def setTFInputGraph(self, value): + # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the + # serializable TFInputGraph object once the inputMapping and outputMapping + # parameters are provided. + raise NotImplementedError( + "Please use the Transformer's constructor to assign `tfInputGraph` field.") + + def getTFInputGraph(self): + return self.getOrDefault(self.tfInputGraph) + + +class HasTFHParams(Params): + """ + Mixin for TensorFlow model hyper-parameters + """ + tfHParams = Param( + Params._dummy(), + "hparams", + "instance of :class:`tf.contrib.training.HParams`, a key-value map-like object", + typeConverter=SparkDLTypeConverters.toTFHParams) + + def setTFHParams(self, value): + return self._set(tfHParam=value) + + def getTFHParams(self): + return self.getOrDefault(self.tfHParams) diff --git a/python/sparkdl/transformers/param.py b/python/sparkdl/transformers/param.py deleted file mode 100644 index a8c7a891..00000000 --- a/python/sparkdl/transformers/param.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright 2017 Databricks, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -""" -Some parts are copied from pyspark.ml.param.shared and some are complementary -to pyspark.ml.param. The copy is due to some useful pyspark fns/classes being -private APIs. -""" - -from functools import wraps -import six - -import keras -import tensorflow as tf - -from pyspark.ml.param import Param, Params, TypeConverters - -from sparkdl.graph.builder import GraphFunction, IsolatedSession -import sparkdl.graph.utils as tfx -from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder - -######################################################## -# Copied from PySpark for backward compatibility. -# They first appeared in Apache Spark version 2.1.1. -######################################################## - -def keyword_only(func): - """ - A decorator that forces keyword arguments in the wrapped method - and saves actual input keyword arguments in `_input_kwargs`. - - .. note:: Should only be used to wrap a method where first arg is `self` - """ - - @wraps(func) - def wrapper(self, *args, **kwargs): - if len(args) > 0: - raise TypeError("Method %s forces keyword arguments." % func.__name__) - self._input_kwargs = kwargs - return func(self, **kwargs) - - return wrapper - - -class HasInputCol(Params): - """ - Mixin for param inputCol: input column name. - """ - - inputCol = Param( - Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString) - - def setInputCol(self, value): - """ - Sets the value of :py:attr:`inputCol`. - """ - return self._set(inputCol=value) - - def getInputCol(self): - """ - Gets the value of inputCol or its default value. - """ - return self.getOrDefault(self.inputCol) - - -class HasOutputCol(Params): - """ - Mixin for param outputCol: output column name. - """ - - outputCol = Param( - Params._dummy(), "outputCol", "output column name.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasOutputCol, self).__init__() - self._setDefault(outputCol=self.uid + '__output') - - def setOutputCol(self, value): - """ - Sets the value of :py:attr:`outputCol`. - """ - return self._set(outputCol=value) - - def getOutputCol(self): - """ - Gets the value of outputCol or its default value. - """ - return self.getOrDefault(self.outputCol) - - -######################################################## -# New in sparkdl -######################################################## - -class SparkDLTypeConverters(object): - @staticmethod - def toTFGraph(value): - if isinstance(value, tf.Graph): - return value - else: - raise TypeError("Could not convert %s to TensorFlow Graph" % type(value)) - - @staticmethod - def toTFInputGraph(value): - if isinstance(value, TFInputGraph): - return value - else: - raise TypeError("Could not convert %s to TFInputGraph" % type(value)) - - @staticmethod - def asColumnToTensorMap(value): - if isinstance(value, dict): - strs_pair_seq = [(k, tfx.as_op_name(v)) for k, v in value.items()] - return sorted(strs_pair_seq) - raise TypeError("Could not convert %s to TensorFlow Tensor" % type(value)) - - @staticmethod - def asTensorToColumnMap(value): - if isinstance(value, dict): - strs_pair_seq = [(tfx.as_op_name(k), v) for k, v in value.items()] - return sorted(strs_pair_seq) - raise TypeError("Could not convert %s to TensorFlow Tensor" % type(value)) - - @staticmethod - def toTFHParams(value): - if isinstance(value, tf.contrib.training.HParams): - return value - else: - raise TypeError("Could not convert %s to TensorFlow HParams" % type(value)) - - @staticmethod - def toStringOrTFTensor(value): - if isinstance(value, tf.Tensor): - return value - else: - try: - return TypeConverters.toString(value) - except TypeError: - raise TypeError("Could not convert %s to tensorflow.Tensor or str" % type(value)) - - @staticmethod - def supportedNameConverter(supportedList): - def converter(value): - if value in supportedList: - return value - else: - raise TypeError("%s %s is not in the supported list." % type(value), str(value)) - - return converter - - -class HasOutputMapping(Params): - """ - Mixin for param outputMapping: ordered list of ('outputTensorOpName', 'outputColName') pairs - """ - outputMapping = Param( - Params._dummy(), - "outputMapping", - "Mapping output :class:`tf.Operation` names to DataFrame column names", - typeConverter=SparkDLTypeConverters.asTensorToColumnMap) - - def setOutputMapping(self, value): - # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the - # serializable TFInputGraph object once the inputMapping and outputMapping - # parameters are provided. - raise NotImplementedError( - "Please use the Transformer's constructor to assign `outputMapping` field.") - - def getOutputMapping(self): - return self.getOrDefault(self.outputMapping) - - -class HasInputMapping(Params): - """ - Mixin for param inputMapping: ordered list of ('inputColName', 'inputTensorOpName') pairs - """ - inputMapping = Param( - Params._dummy(), - "inputMapping", - "Mapping input DataFrame column names to :class:`tf.Operation` names", - typeConverter=SparkDLTypeConverters.asColumnToTensorMap) - - def setInputMapping(self, value): - # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the - # serializable TFInputGraph object once the inputMapping and outputMapping - # parameters are provided. - raise NotImplementedError( - "Please use the Transformer's constructor to assigne `inputMapping` field.") - - def getInputMapping(self): - return self.getOrDefault(self.inputMapping) - - -class HasTFInputGraph(Params): - """ - Mixin for param tfInputGraph: a serializable object derived from a TensorFlow computation graph. - """ - tfInputGraph = Param( - Params._dummy(), - "tfInputGraph", - "A serializable object derived from a TensorFlow computation graph", - typeConverter=SparkDLTypeConverters.toTFInputGraph) - - def __init__(self): - super(HasTFInputGraph, self).__init__() - self._setDefault(tfInputGraph=None) - - def setTFInputGraph(self, value): - # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the - # serializable TFInputGraph object once the inputMapping and outputMapping - # parameters are provided. - raise NotImplementedError( - "Please use the Transformer's constructor to assign `tfInputGraph` field.") - - def getTFInputGraph(self): - return self.getOrDefault(self.tfInputGraph) - - -class HasTFHParams(Params): - """ - Mixin for TensorFlow model hyper-parameters - """ - tfHParams = Param( - Params._dummy(), - "hparams", - "instance of :class:`tf.contrib.training.HParams`, a key-value map-like object", - typeConverter=SparkDLTypeConverters.toTFHParams) - - def setTFHParams(self, value): - return self._set(tfHParam=value) - - def getTFHParams(self): - return self.getOrDefault(self.tfHParams) From 45722059bac739dc0dba2a2346aca27b842e5c49 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Sat, 16 Sep 2017 12:32:33 -0700 Subject: [PATCH 13/23] rebase upstream --- python/sparkdl/param/__init__.py | 3 ++- python/sparkdl/param/converters.py | 2 ++ python/sparkdl/param/shared_params.py | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python/sparkdl/param/__init__.py b/python/sparkdl/param/__init__.py index 98a8f7dd..1080b29f 100644 --- a/python/sparkdl/param/__init__.py +++ b/python/sparkdl/param/__init__.py @@ -15,6 +15,7 @@ from sparkdl.param.shared_params import ( keyword_only, HasInputCol, HasOutputCol, HasLabelCol, HasKerasModel, - HasKerasLoss, HasKerasOptimizer, HasOutputNodeName, SparkDLTypeConverters) + HasKerasLoss, HasKerasOptimizer, HasOutputNodeName) +from sparkdl.param.converters import SparkDLTypeConverters from sparkdl.param.image_params import ( CanLoadImage, HasInputImageNodeName, HasOutputMode, OUTPUT_MODES) diff --git a/python/sparkdl/param/converters.py b/python/sparkdl/param/converters.py index 23fdd9dd..d9b8e512 100644 --- a/python/sparkdl/param/converters.py +++ b/python/sparkdl/param/converters.py @@ -19,6 +19,8 @@ import keras import tensorflow as tf +from pyspark.ml.param import TypeConverters + from sparkdl.graph.builder import GraphFunction, IsolatedSession import sparkdl.graph.utils as tfx from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder diff --git a/python/sparkdl/param/shared_params.py b/python/sparkdl/param/shared_params.py index 8e1b9741..64d4e6f1 100644 --- a/python/sparkdl/param/shared_params.py +++ b/python/sparkdl/param/shared_params.py @@ -25,9 +25,12 @@ import keras import tensorflow as tf +from pyspark.ml.param import Param, Params, TypeConverters + from sparkdl.graph.builder import GraphFunction, IsolatedSession import sparkdl.graph.utils as tfx from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder +from sparkdl.param.converters import SparkDLTypeConverters import sparkdl.utils.keras_model as kmutil From 1cc7591daafb15e8464f05edd3d3ceb4b02343a4 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Sat, 16 Sep 2017 13:00:40 -0700 Subject: [PATCH 14/23] import params fix --- python/sparkdl/param/__init__.py | 7 +++++-- python/sparkdl/transformers/tf_tensor.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/sparkdl/param/__init__.py b/python/sparkdl/param/__init__.py index 1080b29f..ca1a9121 100644 --- a/python/sparkdl/param/__init__.py +++ b/python/sparkdl/param/__init__.py @@ -14,8 +14,11 @@ # from sparkdl.param.shared_params import ( - keyword_only, HasInputCol, HasOutputCol, HasLabelCol, HasKerasModel, - HasKerasLoss, HasKerasOptimizer, HasOutputNodeName) + keyword_only, HasInputCol, HasOutputCol, HasLabelCol, + # TFTransformer Params + HasInputMapping, HasOutputMapping, HasTFInputGraph, HasTFHParams, + # Keras Estimator Params + HasKerasModel, HasKerasLoss, HasKerasOptimizer, HasOutputNodeName) from sparkdl.param.converters import SparkDLTypeConverters from sparkdl.param.image_params import ( CanLoadImage, HasInputImageNodeName, HasOutputMode, OUTPUT_MODES) diff --git a/python/sparkdl/transformers/tf_tensor.py b/python/sparkdl/transformers/tf_tensor.py index 2b091979..027c6043 100644 --- a/python/sparkdl/transformers/tf_tensor.py +++ b/python/sparkdl/transformers/tf_tensor.py @@ -22,8 +22,8 @@ import sparkdl.graph.utils as tfx from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder -from sparkdl.transformers.param import (keyword_only, SparkDLTypeConverters, HasInputMapping, - HasOutputMapping, HasTFInputGraph, HasTFHParams) +from sparkdl.param import (keyword_only, SparkDLTypeConverters, HasInputMapping, + HasOutputMapping, HasTFInputGraph, HasTFHParams) __all__ = ['TFTransformer'] From 2fc6787bcd5246f93b589d96c9fcaa604f98d630 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Sat, 16 Sep 2017 16:10:22 -0700 Subject: [PATCH 15/23] (wip) TFInputGraph impl --- python/sparkdl/graph/input.py | 231 +++++++++++++++++++--------------- 1 file changed, 132 insertions(+), 99 deletions(-) diff --git a/python/sparkdl/graph/input.py b/python/sparkdl/graph/input.py index 7590be13..97e99a33 100644 --- a/python/sparkdl/graph/input.py +++ b/python/sparkdl/graph/input.py @@ -18,7 +18,7 @@ import sparkdl.graph.utils as tfx -__all__ = ["TFInputGraphBuilder", "get_params_from_checkpoint", "get_params_from_saved_model"] +__all__ = ["TFInputGraph"] class TFInputGraph(object): @@ -27,105 +27,36 @@ class TFInputGraph(object): [WARNING] This class should not be called by any user code. """ + def __init__(self): + raise NotImplementedError( + "Please do NOT construct TFInputGraph directly. Instead, use one of the helper functions") - # TODO: for (de-)serialization, the class should correspond to a ProtocolBuffer definition. - def __init__(self, graph_def): - # tf.GraphDef - self.graph_def = graph_def - -def get_params_from_checkpoint(checkpoint_dir, signature_def_key, input_mapping, output_mapping): - assert signature_def_key is not None - gin_builder = TFInputGraphBuilder.fromCheckpoint(checkpoint_dir, signature_def_key) - return gin_builder.build(input_mapping, output_mapping) - -def get_params_from_saved_model(saved_model_dir, tag_set, signature_def_key, input_mapping, - output_mapping): - assert signature_def_key is not None - gin_builder = TFInputGraphBuilder.fromSavedModel(saved_model_dir, tag_set, signature_def_key) - return gin_builder.build(input_mapping, output_mapping) - - -class TFInputGraphBuilder(object): - """ - Create a builder function so as to be able to compile graph for inference. - The actual compilation will be done at the time when the - inputs (feeds) and outputs (fetches) are provided. - :param graph_import_fn: `tf.Session` -> `tf.signature_def`, load a graph to the provided session. - If the meta_graph contains a `signature_def`, return it. - """ - - def __init__(self, graph_import_fn): - # Return signature_def if the underlying graph contains one - self.graph_import_fn = graph_import_fn - - def build(self, input_mapping, output_mapping): - """ - Create a serializable TensorFlow graph representation - :param input_mapping: dict, from input DataFrame column name to internal graph name. - """ - graph = tf.Graph() - with tf.Session(graph=graph) as sess: - sig_def = self.graph_import_fn(sess) - - # Append feeds and input mapping - _input_mapping = {} - if isinstance(input_mapping, dict): - input_mapping = input_mapping.items() - for input_colname, tnsr_or_sig in input_mapping: - if sig_def: - tnsr = sig_def.inputs[tnsr_or_sig].name - else: - tnsr = tnsr_or_sig - _input_mapping[input_colname] = tfx.op_name(graph, tnsr) - input_mapping = _input_mapping - - # Append fetches and output mapping - fetches = [] - _output_mapping = {} - # By default the output columns will have the name of their - # corresponding `tf.Graph` operation names. - # We have to convert them to the user specified output names - if isinstance(output_mapping, dict): - output_mapping = output_mapping.items() - for tnsr_or_sig, requested_colname in output_mapping: - if sig_def: - tnsr = sig_def.outputs[tnsr_or_sig].name - else: - tnsr = tnsr_or_sig - fetches.append(tfx.get_tensor(graph, tnsr)) - tf_output_colname = tfx.op_name(graph, tnsr) - # NOTE(phi-dbq): put the check here as it will be the entry point to construct - # a `TFInputGraph` object. - assert tf_output_colname not in _output_mapping, \ - "operation {} has multiple output tensors and ".format(tf_output_colname) + \ - "at least two of them are used in the output DataFrame. " + \ - "Operation names are used to name columns which leads to conflicts. " + \ - "You can apply `tf.identity` ops to each to avoid name conflicts." - _output_mapping[tf_output_colname] = requested_colname - output_mapping = _output_mapping - - gdef = tfx.strip_and_freeze_until(fetches, graph, sess) - - return TFInputGraph(gdef), input_mapping, output_mapping + @classmethod + def _new_obj_internal(cls): + # pylint: disable=attribute-defined-outside-init + obj = object.__new__(cls) + # TODO: for (de-)serialization, the class should correspond to a ProtocolBuffer definition. + obj.graph_def = None + obj.input_tensor_name_from_signature = None + obj.output_tensor_name_from_signature = None + return obj @classmethod - def fromGraph(cls, graph): + def fromGraph(cls, graph, sess, feed_names, fetch_names): """ Construct a TFInputGraphBuilder from a in memory tf.Graph object """ assert isinstance(graph, tf.Graph), \ ('expect tf.Graph type but got', type(graph)) - def import_graph_fn(sess): - gdef = graph.as_graph_def(add_shapes=True) - with sess.as_default(): - tf.import_graph_def(gdef, name='') - return None # no meta_graph_def + def import_graph_fn(_sess): + assert _sess == sess, 'must have the same session' + return _GinBuilderInfo() - return cls(import_graph_fn) + return _GinBuilder(import_graph_fn, sess, graph).build(feed_names, fetch_names) @classmethod - def fromGraphDef(cls, graph_def): + def fromGraphDef(cls, graph_def, feed_names, fetch_names): """ Construct a TFInputGraphBuilder from a tf.GraphDef object """ @@ -135,16 +66,33 @@ def fromGraphDef(cls, graph_def): def import_graph_fn(sess): with sess.as_default(): tf.import_graph_def(graph_def, name='') - return None + return _GinBuilderInfo() + + return _GinBuilder(import_graph_fn).build(feed_names, fetch_names) + + @classmethod + def fromCheckpoint(cls, checkpoint_dir): + return cls._from_checkpoint_impl(checkpoint_dir, signature_def_key=None) + + @classmethod + def fromCheckpointWithSignature(cls, checkpoint_dir, signature_def_key): + assert signature_def_key is not None + return cls._from_checkpoint_impl(checkpoint_dir, signature_def_key) - return cls(import_graph_fn) + @classmethod + def fromSavedModel(cls, saved_model_dir, tag_set): + return cls._from_saved_model_impl(saved_model_dir, tag_set, signature_def_key=None) + + @classmethod + def fromSavedModelWithSignature(cls, saved_model_dir, tag_set, signature_def_key): + assert signature_def_key is not None + return cls._from_saved_model_impl(saved_model_dir, tag_set, signature_def_key) @classmethod - def fromCheckpoint(cls, checkpoint_dir, signature_def_key=None): + def _from_checkpoint_impl(cls, checkpoint_dir, signature_def_key=None): """ Construct a TFInputGraphBuilder from a model checkpoint """ - def import_graph_fn(sess): # Load checkpoint and import the graph with sess.as_default(): @@ -167,16 +115,15 @@ def import_graph_fn(sess): 'but failed to find it from the meta_graph_def ' + \ 'from checkpoint {}'.format(checkpoint_dir) - return sig_def + return _GinBuilderInfo(sig_def=sig_def) - return cls(import_graph_fn) + return _GinBuilder(import_graph_fn).build() @classmethod - def fromSavedModel(cls, saved_model_dir, tag_set, signature_def_key=None): + def _from_saved_model_impl(cls, saved_model_dir, tag_set, signature_def_key=None): """ Construct a TFInputGraphBuilder from a SavedModel """ - def import_graph_fn(sess): tag_sets = tag_set.split(',') meta_graph_def = tf.saved_model.loader.load(sess, tag_sets, saved_model_dir) @@ -186,6 +133,92 @@ def import_graph_fn(sess): sig_def = tf.contrib.saved_model.get_signature_def_by_key( meta_graph_def, signature_def_key) - return sig_def - - return cls(import_graph_fn) + return _GinBuilderInfo(sig_def=sig_def) + + return _GinBuilder(import_graph_fn).build() + + +class _GinBuilderInfo(object): + def __init__(self, sig_def=None): + self.sig_def = sig_def + + +class _GinBuilder(object): + def __init__(self, import_graph_fn, sess=None, graph=None): + self.import_graph_fn = import_graph_fn + assert (sess is None) == (graph is None) + self.graph = graph or tf.Graph() + if sess is not None: + self.sess = sess + self._should_clean = True + else: + self.sess = tf.Session(self.graph) + self._should_clean = False + + def _build_impl(self, feed_names, fetch_names): + # pylint: disable=protected-access,attribute-defined-outside-init + gin = TFInputGraph._new_obj_internal() + assert (feed_names is None) == (fetch_names is None) + must_have_sig_def = fetch_names is None + with self.sess.as_default(): + _ginfo = self.import_graph_fn(self.sess) + # TODO: extract signature mappings + if must_have_sig_def: + raise NotImplementedError("cannot extract mappings from sig_def at the moment") + for tnsr_name in feed_names: + assert tfx.get_op(self.graph, tnsr_name) + fetches = [tfx.get_tensor(self.graph, tnsr_name) for tnsr_name in fetch_names] + gin.graph_def = tfx.strip_and_freeze_until(fetches, self.graph, self.sess) + return gin + + def build(self, feed_names=None, fetch_names=None): + try: + gin = self._build_impl(feed_names, fetch_names) + finally: + if self._should_clean: + self.sess.close() + return gin + +# def the_rest(input_mapping, output_mapping): +# graph = tf.Graph() +# with tf.Session(graph=graph) as sess: +# # Append feeds and input mapping +# _input_mapping = {} +# if isinstance(input_mapping, dict): +# input_mapping = input_mapping.items() +# for input_colname, tnsr_or_sig in input_mapping: +# if sig_def: +# tnsr = sig_def.inputs[tnsr_or_sig].name +# else: +# tnsr = tnsr_or_sig +# _input_mapping[input_colname] = tfx.op_name(graph, tnsr) +# input_mapping = _input_mapping + +# # Append fetches and output mapping +# fetches = [] +# _output_mapping = {} +# # By default the output columns will have the name of their +# # corresponding `tf.Graph` operation names. +# # We have to convert them to the user specified output names +# if isinstance(output_mapping, dict): +# output_mapping = output_mapping.items() +# for tnsr_or_sig, requested_colname in output_mapping: +# if sig_def: +# tnsr = sig_def.outputs[tnsr_or_sig].name +# else: +# tnsr = tnsr_or_sig +# fetches.append(tfx.get_tensor(graph, tnsr)) +# tf_output_colname = tfx.op_name(graph, tnsr) +# # NOTE(phi-dbq): put the check here as it will be the entry point to construct +# # a `TFInputGraph` object. +# assert tf_output_colname not in _output_mapping, \ +# "operation {} has multiple output tensors and ".format(tf_output_colname) + \ +# "at least two of them are used in the output DataFrame. " + \ +# "Operation names are used to name columns which leads to conflicts. " + \ +# "You can apply `tf.identity` ops to each to avoid name conflicts." +# _output_mapping[tf_output_colname] = requested_colname +# output_mapping = _output_mapping + +# gdef = tfx.strip_and_freeze_until(fetches, graph, sess) + +# return TFInputGraph(gdef), input_mapping, output_mapping From 889df0aa6808eb491f5f7a79de0154c086d8351a Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Sat, 16 Sep 2017 17:16:39 -0700 Subject: [PATCH 16/23] (wip) moving to new API --- python/sparkdl/__init__.py | 4 +- python/sparkdl/graph/input.py | 46 ++- python/sparkdl/param/converters.py | 7 +- python/sparkdl/param/shared_params.py | 2 +- python/sparkdl/transformers/tf_tensor.py | 33 +- python/tests/transformers/tf_tensor_test.py | 353 ++++++++++---------- 6 files changed, 224 insertions(+), 221 deletions(-) diff --git a/python/sparkdl/__init__.py b/python/sparkdl/__init__.py index c5c55ff3..06b91bc8 100644 --- a/python/sparkdl/__init__.py +++ b/python/sparkdl/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. # -from .graph.input import TFInputGraphBuilder +from .graph.input import TFInputGraph from .image.imageIO import imageSchema, imageType, readImages from .transformers.keras_image import KerasImageFileTransformer from .transformers.named_image import DeepImagePredictor, DeepImageFeaturizer @@ -24,6 +24,6 @@ __all__ = [ 'imageSchema', 'imageType', 'readImages', - 'TFImageTransformer', 'TFInputGraphBuilder', 'TFTransformer', + 'TFImageTransformer', 'TFInputGraph', 'TFTransformer', 'DeepImagePredictor', 'DeepImageFeaturizer', 'KerasImageFileTransformer', 'imageInputPlaceholder'] diff --git a/python/sparkdl/graph/input.py b/python/sparkdl/graph/input.py index 97e99a33..54375f6c 100644 --- a/python/sparkdl/graph/input.py +++ b/python/sparkdl/graph/input.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from __future__ import absolute_import, division, print_function import tensorflow as tf from tensorflow.core.protobuf import meta_graph_pb2 @@ -20,7 +21,6 @@ __all__ = ["TFInputGraph"] - class TFInputGraph(object): """ An opaque serializable object containing TensorFlow graph. @@ -141,30 +141,58 @@ def import_graph_fn(sess): class _GinBuilderInfo(object): def __init__(self, sig_def=None): self.sig_def = sig_def - + self.feed_names = None + self.feed_mapping = None + self.fetch_names = None + self.fetch_mapping = None + + def extract_signatures(self): + assert self.sig_def is not None, \ + "ask to find sigdef mapping, but not found any" + + self.feed_mapping = {} + self.feed_names = [] + for sigdef_key, tnsr_info in self.sig_def.inputs: + tnsr_name = tnsr_info.name + self.feed_mapping[sigdef_key] = tnsr_name + self.feed_names.append(tnsr_name) + + self.fetch_mapping = {} + self.fetch_names = [] + for sigdef_key, tnsr_info in self.sig_def.outputs: + tnsr_name = tnsr_info.name + self.feed_mapping[sigdef_key] = tnsr_name + self.fetch_names.append(tnsr_name) class _GinBuilder(object): def __init__(self, import_graph_fn, sess=None, graph=None): self.import_graph_fn = import_graph_fn assert (sess is None) == (graph is None) - self.graph = graph or tf.Graph() if sess is not None: + self.graph = graph self.sess = sess - self._should_clean = True - else: - self.sess = tf.Session(self.graph) self._should_clean = False + else: + self.graph = tf.Graph() + self.sess = tf.Session(graph=self.graph) + self._should_clean = True def _build_impl(self, feed_names, fetch_names): # pylint: disable=protected-access,attribute-defined-outside-init gin = TFInputGraph._new_obj_internal() assert (feed_names is None) == (fetch_names is None) must_have_sig_def = fetch_names is None - with self.sess.as_default(): + print('builder-session', repr(self.sess)) + # NOTE(phi-dbq): both have to be set to default + with self.sess.as_default(), self.graph.as_default(): _ginfo = self.import_graph_fn(self.sess) - # TODO: extract signature mappings if must_have_sig_def: - raise NotImplementedError("cannot extract mappings from sig_def at the moment") + _ginfo.extract_signatures() + feed_names = _ginfo.feed_names + fetch_names = _ginfo.fetch_names + gin.input_tensor_name_from_signature = _ginfo.feed_mapping + gin.output_tensor_name_from_signature = _ginfo.fetch_mapping + for tnsr_name in feed_names: assert tfx.get_op(self.graph, tnsr_name) fetches = [tfx.get_tensor(self.graph, tnsr_name) for tnsr_name in fetch_names] diff --git a/python/sparkdl/param/converters.py b/python/sparkdl/param/converters.py index d9b8e512..ae49cc46 100644 --- a/python/sparkdl/param/converters.py +++ b/python/sparkdl/param/converters.py @@ -13,17 +13,12 @@ # limitations under the License. # -from functools import wraps -import six - -import keras import tensorflow as tf from pyspark.ml.param import TypeConverters -from sparkdl.graph.builder import GraphFunction, IsolatedSession import sparkdl.graph.utils as tfx -from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder +from sparkdl.graph.input import TFInputGraph import sparkdl.utils.keras_model as kmutil class SparkDLTypeConverters(object): diff --git a/python/sparkdl/param/shared_params.py b/python/sparkdl/param/shared_params.py index 64d4e6f1..11330270 100644 --- a/python/sparkdl/param/shared_params.py +++ b/python/sparkdl/param/shared_params.py @@ -29,7 +29,7 @@ from sparkdl.graph.builder import GraphFunction, IsolatedSession import sparkdl.graph.utils as tfx -from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder +from sparkdl.graph.input import TFInputGraph from sparkdl.param.converters import SparkDLTypeConverters import sparkdl.utils.keras_model as kmutil diff --git a/python/sparkdl/transformers/tf_tensor.py b/python/sparkdl/transformers/tf_tensor.py index 027c6043..0b4d5c2e 100644 --- a/python/sparkdl/transformers/tf_tensor.py +++ b/python/sparkdl/transformers/tf_tensor.py @@ -21,7 +21,7 @@ from pyspark.ml import Transformer import sparkdl.graph.utils as tfx -from sparkdl.graph.input import TFInputGraph, TFInputGraphBuilder +from sparkdl.graph.input import TFInputGraph from sparkdl.param import (keyword_only, SparkDLTypeConverters, HasInputMapping, HasOutputMapping, HasTFInputGraph, HasTFHParams) @@ -57,31 +57,6 @@ def setParams(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tf """ super(TFTransformer, self).__init__() kwargs = self._input_kwargs - # The set of parameters either come from some helper functions, - # in which case type(_maybe_gin) is already TFInputGraph. - _maybe_gin = tfInputGraph - if isinstance(_maybe_gin, TFInputGraph): - return self._set(**kwargs) - - # Otherwise, `_maybe_gin` needs to be converted to TFInputGraph - # We put all the conversion logic here rather than in SparkDLTypeConverters - if isinstance(_maybe_gin, TFInputGraphBuilder): - gin = _maybe_gin - elif isinstance(_maybe_gin, tf.Graph): - gin = TFInputGraphBuilder.fromGraph(_maybe_gin) - elif isinstance(_maybe_gin, tf.GraphDef): - gin = TFInputGraphBuilder.fromGraphDef(_maybe_gin) - else: - raise TypeError("TFTransformer expect tfInputGraph convertible to TFInputGraph, " + \ - "but the given type {} cannot be converted, ".format(type(tfInputGraph)) + \ - "please provide `tf.Graph`, `tf.GraphDef` or use one of the " + \ - "`get_params_from_*` helper functions to build parameters") - - gin, input_mapping, output_mapping = gin.build(inputMapping, outputMapping) - kwargs['tfInputGraph'] = gin - kwargs['inputMapping'] = input_mapping - kwargs['outputMapping'] = output_mapping - # Further conanonicalization, e.g. converting dict to sorted str pairs happens here return self._set(**kwargs) @@ -94,11 +69,11 @@ def _transform(self, dataset): with tf.Session(graph=graph): analyzed_df = tfs.analyze(dataset) - out_tnsr_op_names = [tfx.as_op_name(tnsr_op_name) for tnsr_op_name, _ in output_mapping] + out_tnsr_op_names = [tfx.as_op_name(tnsr_name) for tnsr_name, _ in output_mapping] tf.import_graph_def(graph_def=gin.graph_def, name='', return_elements=out_tnsr_op_names) - feed_dict = dict((tfx.op_name(graph, tnsr_op_name), col_name) - for col_name, tnsr_op_name in input_mapping) + feed_dict = dict((tfx.op_name(graph, tnsr_name), col_name) + for col_name, tnsr_name in input_mapping) fetches = [tfx.get_tensor(graph, tnsr_op_name) for tnsr_op_name in out_tnsr_op_names] out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict) diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index d6fe9296..8cc444da 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -27,7 +27,6 @@ from pyspark.sql.types import Row -from sparkdl.graph.builder import IsolatedSession from sparkdl.graph.input import * import sparkdl.graph.utils as tfx from sparkdl.transformers.tf_tensor import TFTransformer @@ -46,6 +45,8 @@ def setUp(self): self.output_col = 'outputCol' self.output_op_name = 'tnsrOpOut' + self.feed_names = [] + self.fetch_names = [] self.input_mapping = {} self.output_mapping = {} @@ -57,36 +58,39 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.model_output_root, ignore_errors=True) - def build_standard_transformers(self, sess, gin_builder_convertible): + def build_standard_transformers(self, sess, tf_input_graph): def _add_transformer(imap, omap): trnsfmr = TFTransformer( - tfInputGraph=gin_builder_convertible, inputMapping=imap, outputMapping=omap) + tfInputGraph=tf_input_graph, inputMapping=imap, outputMapping=omap) self.transformers.append(trnsfmr) - _add_transformer(self.input_mapping, self.output_mapping) - - imap = [(col, tfx.get_tensor(sess.graph, op_name)) - for col, op_name in self.input_mapping.items()] - omap = [(tfx.get_tensor(sess.graph, op_name), col) - for op_name, col in self.output_mapping.items()] + imap = dict((col, tfx.tensor_name(sess.graph, op_name)) + for col, op_name in self.input_mapping.items()) + omap = dict((tfx.tensor_name(sess.graph, op_name), col) + for op_name, col in self.output_mapping.items()) _add_transformer(imap, omap) - @contextmanager - def run_test_in_tf_session(self, replica=1): - """ [THIS IS NOT A TEST]: encapsulate general test workflow """ - + def setup_iomap(self, replica=1): if replica > 1: for i in range(replica): colname = '{}_replica{:03d}'.format(self.input_col, i) tnsr_op_name = '{}_replica{:03d}'.format(self.input_op_name, i) self.input_mapping[colname] = tnsr_op_name + self.feed_names.append(tnsr_op_name + ':0') colname = '{}_replica{:03d}'.format(self.output_col, i) tnsr_op_name = '{}_replica{:03d}'.format(self.output_op_name, i) self.output_mapping[tnsr_op_name] = colname + self.fetch_names.append(tnsr_op_name + ':0') else: self.input_mapping = {self.input_col: self.input_op_name} + self.feed_names = [self.input_op_name + ':0'] self.output_mapping = {self.output_op_name: self.output_col} + self.fetch_names = [self.output_op_name + ':0'] + + @contextmanager + def _run_test_in_tf_session(self): + """ [THIS IS NOT A TEST]: encapsulate general test workflow """ # Build local features and DataFrame from it local_features = [] @@ -141,174 +145,175 @@ def run_test_in_tf_session(self, replica=1): def test_build_from_tf_graph(self): - with self.run_test_in_tf_session() as sess: + self.setup_iomap(replica=1) + with self._run_test_in_tf_session() as sess: # Begin building graph x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) _ = tf.reduce_mean(x, axis=1, name=self.output_op_name) # End building graph # Begin building transformers - self.build_standard_transformers(sess, sess.graph) - self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraph(sess.graph)) + self.build_standard_transformers( + sess, TFInputGraph.fromGraph(sess.graph, sess, self.feed_names, self.fetch_names)) gdef = sess.graph.as_graph_def() - self.build_standard_transformers(sess, gdef) - self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraphDef(gdef)) + self.build_standard_transformers( + sess, TFInputGraph.fromGraphDef(gdef, self.feed_names, self.fetch_names)) # End building transformers - def test_build_from_saved_model(self): - # Setup saved model export directory - saved_model_root = self.model_output_root - saved_model_dir = os.path.join(saved_model_root, 'saved_model') - serving_tag = "serving_tag" - serving_sigdef_key = 'prediction_signature' - builder = tf.saved_model.builder.SavedModelBuilder(saved_model_dir) - - with self.run_test_in_tf_session() as sess: - # Model definition: begin - x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) - w = tf.Variable(tf.random_normal([self.vec_size], dtype=tf.float64), - dtype=tf.float64, name='varW') - z = tf.reduce_mean(x * w, axis=1, name=self.output_op_name) - # Model definition ends - - sess.run(w.initializer) - - sig_inputs = { - 'input_sig': tf.saved_model.utils.build_tensor_info(x)} - sig_outputs = { - 'output_sig': tf.saved_model.utils.build_tensor_info(z)} - - serving_sigdef = tf.saved_model.signature_def_utils.build_signature_def( - inputs=sig_inputs, - outputs=sig_outputs) - - builder.add_meta_graph_and_variables(sess, - [serving_tag], - signature_def_map={ - serving_sigdef_key: serving_sigdef}) - builder.save() - - # Build the transformer from exported serving model - # We are using signaures, thus must provide the keys - tfInputGraph, inputMapping, outputMapping = get_params_from_saved_model( - saved_model_dir, serving_tag, serving_sigdef_key, - input_mapping={ - self.input_col: 'input_sig'}, - output_mapping={ - 'output_sig': self.output_col}) - trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, - inputMapping=inputMapping, - outputMapping=outputMapping) - self.transformers.append(trans_with_sig) - - # Build the transformer from exported serving model - # We are not using signatures, thus must provide tensor/operation names - gin_builder = TFInputGraphBuilder.fromSavedModel( - saved_model_dir, tag_set=serving_tag, signature_def_key=None) - self.build_standard_transformers(sess, gin_builder) - - - def test_build_from_checkpoint(self): - """ - Test constructing a Transformer from a TensorFlow training checkpoint - """ - # Build the TensorFlow graph - model_ckpt_dir = self.model_output_root - ckpt_path_prefix = os.path.join(model_ckpt_dir, 'model_ckpt') - serving_sigdef_key = 'prediction_signature' - # Warning: please use a new graph for each test cases - # or the tests could affect one another - with self.run_test_in_tf_session() as sess: - x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) - #x = tf.placeholder(tf.float64, shape=[None, vec_size], name=input_col) - w = tf.Variable(tf.random_normal([self.vec_size], dtype=tf.float64), - dtype=tf.float64, name='varW') - z = tf.reduce_mean(x * w, axis=1, name=self.output_op_name) - sess.run(w.initializer) - saver = tf.train.Saver(var_list=[w]) - _ = saver.save(sess, ckpt_path_prefix, global_step=2702) - - # Prepare the signature_def - serving_sigdef = tf.saved_model.signature_def_utils.build_signature_def( - inputs={ - 'input_sig': tf.saved_model.utils.build_tensor_info(x) - }, - outputs={ - 'output_sig': tf.saved_model.utils.build_tensor_info(z) - }) - - # A rather contrived way to add signature def to a meta_graph - meta_graph_def = tf.train.export_meta_graph() - - # Find the meta_graph file (there should be only one) - _ckpt_meta_fpaths = glob('{}/*.meta'.format(model_ckpt_dir)) - self.assertEqual(len(_ckpt_meta_fpaths), 1, msg=','.join(_ckpt_meta_fpaths)) - ckpt_meta_fpath = _ckpt_meta_fpaths[0] - - # Add signature_def to the meta_graph and serialize it - # This will overwrite the existing meta_graph_def file - meta_graph_def.signature_def[serving_sigdef_key].CopyFrom(serving_sigdef) - with open(ckpt_meta_fpath, mode='wb') as fout: - fout.write(meta_graph_def.SerializeToString()) - - tfInputGraph, inputMapping, outputMapping = get_params_from_checkpoint( - model_ckpt_dir, serving_sigdef_key, - input_mapping={ - self.input_col: 'input_sig'}, - output_mapping={ - 'output_sig': self.output_col}) - trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, - inputMapping=inputMapping, - outputMapping=outputMapping) - self.transformers.append(trans_with_sig) - - gin_builder = TFInputGraphBuilder.fromCheckpoint(model_ckpt_dir) - self.build_standard_transformers(sess, gin_builder) - - - def test_multi_io(self): - # Build the TensorFlow graph - with self.run_test_in_tf_session(replica=2) as sess: - xs = [] - for tnsr_op_name in self.input_mapping.values(): - x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=tnsr_op_name) - xs.append(x) - - zs = [] - for i, tnsr_op_name in enumerate(self.output_mapping.keys()): - z = tf.reduce_mean(xs[i], axis=1, name=tnsr_op_name) - zs.append(z) - - self.build_standard_transformers(sess, sess.graph) - self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraph(sess.graph)) - - def test_mixed_keras_graph(self): - # Build the graph: the output should have the same leading/batch dimension - with IsolatedSession(using_keras=True) as issn: - tnsr_in = tf.placeholder( - tf.double, shape=[None, self.vec_size], name=self.input_op_name) - inp = tf.expand_dims(tnsr_in, axis=2) - # Keras layers does not take tf.double - inp = tf.cast(inp, tf.float32) - conv = Conv1D(filters=4, kernel_size=2)(inp) - pool = MaxPool1D(pool_size=2)(conv) - flat = Flatten()(pool) - dense = Dense(1)(flat) - # We must keep the leading dimension of the output - redsum = tf.reduce_sum(dense, axis=1) - tnsr_out = tf.cast(redsum, tf.double, name=self.output_op_name) - - # Initialize the variables - init_op = tf.global_variables_initializer() - issn.run(init_op) - # We could train the model ... but skip it here - gfn = issn.asGraphFunction([tnsr_in], [tnsr_out]) - - with self.run_test_in_tf_session() as sess: - tf.import_graph_def(gfn.graph_def, name='') - - self.build_standard_transformers(sess, sess.graph) - self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraph(sess.graph)) - self.build_standard_transformers(sess, gfn.graph_def) - self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraphDef(gfn.graph_def)) + # def test_build_from_saved_model(self): + # # Setup saved model export directory + # saved_model_root = self.model_output_root + # saved_model_dir = os.path.join(saved_model_root, 'saved_model') + # serving_tag = "serving_tag" + # serving_sigdef_key = 'prediction_signature' + # builder = tf.saved_model.builder.SavedModelBuilder(saved_model_dir) + + # with self.run_test_in_tf_session() as sess: + # # Model definition: begin + # x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) + # w = tf.Variable(tf.random_normal([self.vec_size], dtype=tf.float64), + # dtype=tf.float64, name='varW') + # z = tf.reduce_mean(x * w, axis=1, name=self.output_op_name) + # # Model definition ends + + # sess.run(w.initializer) + + # sig_inputs = { + # 'input_sig': tf.saved_model.utils.build_tensor_info(x)} + # sig_outputs = { + # 'output_sig': tf.saved_model.utils.build_tensor_info(z)} + + # serving_sigdef = tf.saved_model.signature_def_utils.build_signature_def( + # inputs=sig_inputs, + # outputs=sig_outputs) + + # builder.add_meta_graph_and_variables(sess, + # [serving_tag], + # signature_def_map={ + # serving_sigdef_key: serving_sigdef}) + # builder.save() + + # # Build the transformer from exported serving model + # # We are using signaures, thus must provide the keys + # tfInputGraph, inputMapping, outputMapping = get_params_from_saved_model( + # saved_model_dir, serving_tag, serving_sigdef_key, + # input_mapping={ + # self.input_col: 'input_sig'}, + # output_mapping={ + # 'output_sig': self.output_col}) + # trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, + # inputMapping=inputMapping, + # outputMapping=outputMapping) + # self.transformers.append(trans_with_sig) + + # # Build the transformer from exported serving model + # # We are not using signatures, thus must provide tensor/operation names + # gin_builder = TFInputGraphBuilder.fromSavedModel( + # saved_model_dir, tag_set=serving_tag, signature_def_key=None) + # self.build_standard_transformers(sess, gin_builder) + + + # def test_build_from_checkpoint(self): + # """ + # Test constructing a Transformer from a TensorFlow training checkpoint + # """ + # # Build the TensorFlow graph + # model_ckpt_dir = self.model_output_root + # ckpt_path_prefix = os.path.join(model_ckpt_dir, 'model_ckpt') + # serving_sigdef_key = 'prediction_signature' + # # Warning: please use a new graph for each test cases + # # or the tests could affect one another + # with self.run_test_in_tf_session() as sess: + # x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) + # #x = tf.placeholder(tf.float64, shape=[None, vec_size], name=input_col) + # w = tf.Variable(tf.random_normal([self.vec_size], dtype=tf.float64), + # dtype=tf.float64, name='varW') + # z = tf.reduce_mean(x * w, axis=1, name=self.output_op_name) + # sess.run(w.initializer) + # saver = tf.train.Saver(var_list=[w]) + # _ = saver.save(sess, ckpt_path_prefix, global_step=2702) + + # # Prepare the signature_def + # serving_sigdef = tf.saved_model.signature_def_utils.build_signature_def( + # inputs={ + # 'input_sig': tf.saved_model.utils.build_tensor_info(x) + # }, + # outputs={ + # 'output_sig': tf.saved_model.utils.build_tensor_info(z) + # }) + + # # A rather contrived way to add signature def to a meta_graph + # meta_graph_def = tf.train.export_meta_graph() + + # # Find the meta_graph file (there should be only one) + # _ckpt_meta_fpaths = glob('{}/*.meta'.format(model_ckpt_dir)) + # self.assertEqual(len(_ckpt_meta_fpaths), 1, msg=','.join(_ckpt_meta_fpaths)) + # ckpt_meta_fpath = _ckpt_meta_fpaths[0] + + # # Add signature_def to the meta_graph and serialize it + # # This will overwrite the existing meta_graph_def file + # meta_graph_def.signature_def[serving_sigdef_key].CopyFrom(serving_sigdef) + # with open(ckpt_meta_fpath, mode='wb') as fout: + # fout.write(meta_graph_def.SerializeToString()) + + # tfInputGraph, inputMapping, outputMapping = get_params_from_checkpoint( + # model_ckpt_dir, serving_sigdef_key, + # input_mapping={ + # self.input_col: 'input_sig'}, + # output_mapping={ + # 'output_sig': self.output_col}) + # trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, + # inputMapping=inputMapping, + # outputMapping=outputMapping) + # self.transformers.append(trans_with_sig) + + # gin_builder = TFInputGraphBuilder.fromCheckpoint(model_ckpt_dir) + # self.build_standard_transformers(sess, gin_builder) + + + # def test_multi_io(self): + # # Build the TensorFlow graph + # with self.run_test_in_tf_session(replica=2) as sess: + # xs = [] + # for tnsr_op_name in self.input_mapping.values(): + # x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=tnsr_op_name) + # xs.append(x) + + # zs = [] + # for i, tnsr_op_name in enumerate(self.output_mapping.keys()): + # z = tf.reduce_mean(xs[i], axis=1, name=tnsr_op_name) + # zs.append(z) + + # self.build_standard_transformers(sess, sess.graph) + # self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraph(sess.graph)) + + # def test_mixed_keras_graph(self): + # # Build the graph: the output should have the same leading/batch dimension + # with IsolatedSession(using_keras=True) as issn: + # tnsr_in = tf.placeholder( + # tf.double, shape=[None, self.vec_size], name=self.input_op_name) + # inp = tf.expand_dims(tnsr_in, axis=2) + # # Keras layers does not take tf.double + # inp = tf.cast(inp, tf.float32) + # conv = Conv1D(filters=4, kernel_size=2)(inp) + # pool = MaxPool1D(pool_size=2)(conv) + # flat = Flatten()(pool) + # dense = Dense(1)(flat) + # # We must keep the leading dimension of the output + # redsum = tf.reduce_sum(dense, axis=1) + # tnsr_out = tf.cast(redsum, tf.double, name=self.output_op_name) + + # # Initialize the variables + # init_op = tf.global_variables_initializer() + # issn.run(init_op) + # # We could train the model ... but skip it here + # gfn = issn.asGraphFunction([tnsr_in], [tnsr_out]) + + # with self.run_test_in_tf_session() as sess: + # tf.import_graph_def(gfn.graph_def, name='') + + # self.build_standard_transformers(sess, sess.graph) + # self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraph(sess.graph)) + # self.build_standard_transformers(sess, gfn.graph_def) + # self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraphDef(gfn.graph_def)) From 86cd6d9310433751232579744843030f8e29781d Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Sat, 16 Sep 2017 18:08:57 -0700 Subject: [PATCH 17/23] (wip) enable saved_model tests --- python/sparkdl/graph/input.py | 69 ++++++++++--- python/tests/transformers/tf_tensor_test.py | 104 ++++++++++---------- 2 files changed, 110 insertions(+), 63 deletions(-) diff --git a/python/sparkdl/graph/input.py b/python/sparkdl/graph/input.py index 54375f6c..285dd79a 100644 --- a/python/sparkdl/graph/input.py +++ b/python/sparkdl/graph/input.py @@ -41,6 +41,28 @@ def _new_obj_internal(cls): obj.output_tensor_name_from_signature = None return obj + def translateInputMapping(self, input_mapping): + assert self.input_tensor_name_from_signature is not None + _input_mapping = {} + if isinstance(input_mapping, dict): + input_mapping = list(input_mapping.items()) + assert isinstance(input_mapping, list) + for col_name, sig_key in input_mapping: + tnsr_name = self.input_tensor_name_from_signature[sig_key] + _input_mapping[col_name] = tnsr_name + return _input_mapping + + def translateOutputMapping(self, output_mapping): + assert self.output_tensor_name_from_signature is not None + _output_mapping = {} + if isinstance(output_mapping, dict): + output_mapping = list(output_mapping.items()) + assert isinstance(output_mapping, list) + for sig_key, col_name in output_mapping: + tnsr_name = self.output_tensor_name_from_signature[sig_key] + _output_mapping[tnsr_name] = col_name + return _output_mapping + @classmethod def fromGraph(cls, graph, sess, feed_names, fetch_names): """ @@ -71,28 +93,43 @@ def import_graph_fn(sess): return _GinBuilder(import_graph_fn).build(feed_names, fetch_names) @classmethod - def fromCheckpoint(cls, checkpoint_dir): - return cls._from_checkpoint_impl(checkpoint_dir, signature_def_key=None) + def fromCheckpoint(cls, checkpoint_dir, feed_names, fetch_names): + return cls._from_checkpoint_impl(checkpoint_dir, + signature_def_key=None, + feed_names=feed_names, fetch_names=fetch_names) @classmethod def fromCheckpointWithSignature(cls, checkpoint_dir, signature_def_key): assert signature_def_key is not None - return cls._from_checkpoint_impl(checkpoint_dir, signature_def_key) + return cls._from_checkpoint_impl(checkpoint_dir, + signature_def_key, + feed_names=None, fetch_names=None) @classmethod - def fromSavedModel(cls, saved_model_dir, tag_set): - return cls._from_saved_model_impl(saved_model_dir, tag_set, signature_def_key=None) + def fromSavedModel(cls, saved_model_dir, tag_set, feed_names, fetch_names): + return cls._from_saved_model_impl(saved_model_dir, tag_set, + signature_def_key=None, + feed_names=feed_names, fetch_names=fetch_names) @classmethod def fromSavedModelWithSignature(cls, saved_model_dir, tag_set, signature_def_key): assert signature_def_key is not None - return cls._from_saved_model_impl(saved_model_dir, tag_set, signature_def_key) + return cls._from_saved_model_impl(saved_model_dir, tag_set, + signature_def_key=signature_def_key, + feed_names=None, fetch_names=None) @classmethod - def _from_checkpoint_impl(cls, checkpoint_dir, signature_def_key=None): + def _from_checkpoint_impl(cls, + checkpoint_dir, + signature_def_key=None, + feed_names=None, + fetch_names=None): """ Construct a TFInputGraphBuilder from a model checkpoint """ + assert (feed_names is None) == (fetch_names is None) + assert (feed_names is None) or (signature_def_key is None) + def import_graph_fn(sess): # Load checkpoint and import the graph with sess.as_default(): @@ -117,13 +154,19 @@ def import_graph_fn(sess): return _GinBuilderInfo(sig_def=sig_def) - return _GinBuilder(import_graph_fn).build() + return _GinBuilder(import_graph_fn).build(feed_names, fetch_names) @classmethod - def _from_saved_model_impl(cls, saved_model_dir, tag_set, signature_def_key=None): + def _from_saved_model_impl(cls, saved_model_dir, tag_set, + signature_def_key=None, + feed_names=None, + fetch_names=None): """ Construct a TFInputGraphBuilder from a SavedModel """ + assert (feed_names is None) == (fetch_names is None) + assert (feed_names is None) or (signature_def_key is None) + def import_graph_fn(sess): tag_sets = tag_set.split(',') meta_graph_def = tf.saved_model.loader.load(sess, tag_sets, saved_model_dir) @@ -135,7 +178,7 @@ def import_graph_fn(sess): return _GinBuilderInfo(sig_def=sig_def) - return _GinBuilder(import_graph_fn).build() + return _GinBuilder(import_graph_fn).build(feed_names, fetch_names) class _GinBuilderInfo(object): @@ -152,16 +195,16 @@ def extract_signatures(self): self.feed_mapping = {} self.feed_names = [] - for sigdef_key, tnsr_info in self.sig_def.inputs: + for sigdef_key, tnsr_info in self.sig_def.inputs.items(): tnsr_name = tnsr_info.name self.feed_mapping[sigdef_key] = tnsr_name self.feed_names.append(tnsr_name) self.fetch_mapping = {} self.fetch_names = [] - for sigdef_key, tnsr_info in self.sig_def.outputs: + for sigdef_key, tnsr_info in self.sig_def.outputs.items(): tnsr_name = tnsr_info.name - self.feed_mapping[sigdef_key] = tnsr_name + self.fetch_mapping[sigdef_key] = tnsr_name self.fetch_names.append(tnsr_name) class _GinBuilder(object): diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index 8cc444da..45622894 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -161,57 +161,61 @@ def test_build_from_tf_graph(self): # End building transformers - # def test_build_from_saved_model(self): - # # Setup saved model export directory - # saved_model_root = self.model_output_root - # saved_model_dir = os.path.join(saved_model_root, 'saved_model') - # serving_tag = "serving_tag" - # serving_sigdef_key = 'prediction_signature' - # builder = tf.saved_model.builder.SavedModelBuilder(saved_model_dir) - - # with self.run_test_in_tf_session() as sess: - # # Model definition: begin - # x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) - # w = tf.Variable(tf.random_normal([self.vec_size], dtype=tf.float64), - # dtype=tf.float64, name='varW') - # z = tf.reduce_mean(x * w, axis=1, name=self.output_op_name) - # # Model definition ends - - # sess.run(w.initializer) - - # sig_inputs = { - # 'input_sig': tf.saved_model.utils.build_tensor_info(x)} - # sig_outputs = { - # 'output_sig': tf.saved_model.utils.build_tensor_info(z)} - - # serving_sigdef = tf.saved_model.signature_def_utils.build_signature_def( - # inputs=sig_inputs, - # outputs=sig_outputs) - - # builder.add_meta_graph_and_variables(sess, - # [serving_tag], - # signature_def_map={ - # serving_sigdef_key: serving_sigdef}) - # builder.save() - - # # Build the transformer from exported serving model - # # We are using signaures, thus must provide the keys - # tfInputGraph, inputMapping, outputMapping = get_params_from_saved_model( - # saved_model_dir, serving_tag, serving_sigdef_key, - # input_mapping={ - # self.input_col: 'input_sig'}, - # output_mapping={ - # 'output_sig': self.output_col}) - # trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, - # inputMapping=inputMapping, - # outputMapping=outputMapping) - # self.transformers.append(trans_with_sig) + def test_build_from_saved_model(self): + self.setup_iomap(replica=1) + # Setup saved model export directory + saved_model_root = self.model_output_root + saved_model_dir = os.path.join(saved_model_root, 'saved_model') + serving_tag = "serving_tag" + serving_sigdef_key = 'prediction_signature' + builder = tf.saved_model.builder.SavedModelBuilder(saved_model_dir) - # # Build the transformer from exported serving model - # # We are not using signatures, thus must provide tensor/operation names - # gin_builder = TFInputGraphBuilder.fromSavedModel( - # saved_model_dir, tag_set=serving_tag, signature_def_key=None) - # self.build_standard_transformers(sess, gin_builder) + with self._run_test_in_tf_session() as sess: + # Model definition: begin + x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) + w = tf.Variable(tf.random_normal([self.vec_size], dtype=tf.float64), + dtype=tf.float64, name='varW') + z = tf.reduce_mean(x * w, axis=1, name=self.output_op_name) + # Model definition ends + + sess.run(w.initializer) + + sig_inputs = { + 'input_sig': tf.saved_model.utils.build_tensor_info(x)} + sig_outputs = { + 'output_sig': tf.saved_model.utils.build_tensor_info(z)} + + serving_sigdef = tf.saved_model.signature_def_utils.build_signature_def( + inputs=sig_inputs, + outputs=sig_outputs) + + builder.add_meta_graph_and_variables(sess, + [serving_tag], + signature_def_map={ + serving_sigdef_key: serving_sigdef}) + builder.save() + + # Build the transformer from exported serving model + # We are using signaures, thus must provide the keys + tfInputGraph = TFInputGraph.fromSavedModelWithSignature( + saved_model_dir, serving_tag, serving_sigdef_key) + + inputMapping = tfInputGraph.translateInputMapping({ + self.input_col: 'input_sig' + }) + outputMapping = tfInputGraph.translateOutputMapping({ + 'output_sig': self.output_col + }) + trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, + inputMapping=inputMapping, + outputMapping=outputMapping) + self.transformers.append(trans_with_sig) + + # Build the transformer from exported serving model + # We are not using signatures, thus must provide tensor/operation names + gin_builder = TFInputGraph.fromSavedModel( + saved_model_dir, serving_tag, self.feed_names, self.fetch_names) + self.build_standard_transformers(sess, gin_builder) # def test_build_from_checkpoint(self): From ac091823b37533756f325ece95c93119056304ba Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Sat, 16 Sep 2017 19:55:30 -0700 Subject: [PATCH 18/23] (wip) enable checkpoint test --- python/tests/transformers/tf_tensor_test.py | 130 +++++++++++--------- 1 file changed, 70 insertions(+), 60 deletions(-) diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index 45622894..d968f797 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -49,12 +49,14 @@ def setUp(self): self.fetch_names = [] self.input_mapping = {} self.output_mapping = {} + self.setup_iomap(replica=1) self.transformers = [] self.test_case_results = [] # Build a temporary directory, which might or might not be used by the test self.model_output_root = tempfile.mkdtemp() + def tearDown(self): shutil.rmtree(self.model_output_root, ignore_errors=True) @@ -71,6 +73,11 @@ def _add_transformer(imap, omap): _add_transformer(imap, omap) def setup_iomap(self, replica=1): + self.input_mapping = {} + self.feed_names = [] + self.output_mapping = {} + self.fetch_names = [] + if replica > 1: for i in range(replica): colname = '{}_replica{:03d}'.format(self.input_col, i) @@ -145,7 +152,7 @@ def _run_test_in_tf_session(self): def test_build_from_tf_graph(self): - self.setup_iomap(replica=1) + """ Build TFTransformer from tf.Graph """ with self._run_test_in_tf_session() as sess: # Begin building graph x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) @@ -162,7 +169,7 @@ def test_build_from_tf_graph(self): def test_build_from_saved_model(self): - self.setup_iomap(replica=1) + """ Build TFTransformer from saved model """ # Setup saved model export directory saved_model_root = self.model_output_root saved_model_dir = os.path.join(saved_model_root, 'saved_model') @@ -213,67 +220,70 @@ def test_build_from_saved_model(self): # Build the transformer from exported serving model # We are not using signatures, thus must provide tensor/operation names - gin_builder = TFInputGraph.fromSavedModel( + gin = TFInputGraph.fromSavedModel( saved_model_dir, serving_tag, self.feed_names, self.fetch_names) - self.build_standard_transformers(sess, gin_builder) + self.build_standard_transformers(sess, gin) - # def test_build_from_checkpoint(self): - # """ - # Test constructing a Transformer from a TensorFlow training checkpoint - # """ - # # Build the TensorFlow graph - # model_ckpt_dir = self.model_output_root - # ckpt_path_prefix = os.path.join(model_ckpt_dir, 'model_ckpt') - # serving_sigdef_key = 'prediction_signature' - # # Warning: please use a new graph for each test cases - # # or the tests could affect one another - # with self.run_test_in_tf_session() as sess: - # x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) - # #x = tf.placeholder(tf.float64, shape=[None, vec_size], name=input_col) - # w = tf.Variable(tf.random_normal([self.vec_size], dtype=tf.float64), - # dtype=tf.float64, name='varW') - # z = tf.reduce_mean(x * w, axis=1, name=self.output_op_name) - # sess.run(w.initializer) - # saver = tf.train.Saver(var_list=[w]) - # _ = saver.save(sess, ckpt_path_prefix, global_step=2702) - - # # Prepare the signature_def - # serving_sigdef = tf.saved_model.signature_def_utils.build_signature_def( - # inputs={ - # 'input_sig': tf.saved_model.utils.build_tensor_info(x) - # }, - # outputs={ - # 'output_sig': tf.saved_model.utils.build_tensor_info(z) - # }) - - # # A rather contrived way to add signature def to a meta_graph - # meta_graph_def = tf.train.export_meta_graph() - - # # Find the meta_graph file (there should be only one) - # _ckpt_meta_fpaths = glob('{}/*.meta'.format(model_ckpt_dir)) - # self.assertEqual(len(_ckpt_meta_fpaths), 1, msg=','.join(_ckpt_meta_fpaths)) - # ckpt_meta_fpath = _ckpt_meta_fpaths[0] - - # # Add signature_def to the meta_graph and serialize it - # # This will overwrite the existing meta_graph_def file - # meta_graph_def.signature_def[serving_sigdef_key].CopyFrom(serving_sigdef) - # with open(ckpt_meta_fpath, mode='wb') as fout: - # fout.write(meta_graph_def.SerializeToString()) - - # tfInputGraph, inputMapping, outputMapping = get_params_from_checkpoint( - # model_ckpt_dir, serving_sigdef_key, - # input_mapping={ - # self.input_col: 'input_sig'}, - # output_mapping={ - # 'output_sig': self.output_col}) - # trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, - # inputMapping=inputMapping, - # outputMapping=outputMapping) - # self.transformers.append(trans_with_sig) - - # gin_builder = TFInputGraphBuilder.fromCheckpoint(model_ckpt_dir) - # self.build_standard_transformers(sess, gin_builder) + def test_build_from_checkpoint(self): + """ Build TFTransformer from a model checkpoint """ + # Build the TensorFlow graph + model_ckpt_dir = self.model_output_root + ckpt_path_prefix = os.path.join(model_ckpt_dir, 'model_ckpt') + serving_sigdef_key = 'prediction_signature' + + with self._run_test_in_tf_session() as sess: + x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=self.input_op_name) + #x = tf.placeholder(tf.float64, shape=[None, vec_size], name=input_col) + w = tf.Variable(tf.random_normal([self.vec_size], dtype=tf.float64), + dtype=tf.float64, name='varW') + z = tf.reduce_mean(x * w, axis=1, name=self.output_op_name) + sess.run(w.initializer) + saver = tf.train.Saver(var_list=[w]) + _ = saver.save(sess, ckpt_path_prefix, global_step=2702) + + # Prepare the signature_def + serving_sigdef = tf.saved_model.signature_def_utils.build_signature_def( + inputs={ + 'input_sig': tf.saved_model.utils.build_tensor_info(x) + }, + outputs={ + 'output_sig': tf.saved_model.utils.build_tensor_info(z) + }) + + # A rather contrived way to add signature def to a meta_graph + meta_graph_def = tf.train.export_meta_graph() + + # Find the meta_graph file (there should be only one) + _ckpt_meta_fpaths = glob('{}/*.meta'.format(model_ckpt_dir)) + self.assertEqual(len(_ckpt_meta_fpaths), 1, msg=','.join(_ckpt_meta_fpaths)) + ckpt_meta_fpath = _ckpt_meta_fpaths[0] + + # Add signature_def to the meta_graph and serialize it + # This will overwrite the existing meta_graph_def file + meta_graph_def.signature_def[serving_sigdef_key].CopyFrom(serving_sigdef) + with open(ckpt_meta_fpath, mode='wb') as fout: + fout.write(meta_graph_def.SerializeToString()) + + # Build the transformer from exported serving model + # We are using signaures, thus must provide the keys + tfInputGraph = TFInputGraph.fromCheckpointWithSignature( + model_ckpt_dir, serving_sigdef_key) + + inputMapping = tfInputGraph.translateInputMapping({ + self.input_col: 'input_sig' + }) + outputMapping = tfInputGraph.translateOutputMapping({ + 'output_sig': self.output_col + }) + trans_with_sig = TFTransformer(tfInputGraph=tfInputGraph, + inputMapping=inputMapping, + outputMapping=outputMapping) + self.transformers.append(trans_with_sig) + + # Transformer without using signature_def + gin = TFInputGraph.fromCheckpoint(model_ckpt_dir, self.feed_names, self.fetch_names) + self.build_standard_transformers(sess, gin) # def test_multi_io(self): From 6b22eed89353e32d84d7b8e5cd4197084f70da82 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Sat, 16 Sep 2017 19:59:04 -0700 Subject: [PATCH 19/23] (wip) enable multiple tensor tests --- python/tests/transformers/tf_tensor_test.py | 35 ++++++++++++--------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index d968f797..545f374f 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -286,21 +286,28 @@ def test_build_from_checkpoint(self): self.build_standard_transformers(sess, gin) - # def test_multi_io(self): - # # Build the TensorFlow graph - # with self.run_test_in_tf_session(replica=2) as sess: - # xs = [] - # for tnsr_op_name in self.input_mapping.values(): - # x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=tnsr_op_name) - # xs.append(x) - - # zs = [] - # for i, tnsr_op_name in enumerate(self.output_mapping.keys()): - # z = tf.reduce_mean(xs[i], axis=1, name=tnsr_op_name) - # zs.append(z) + def test_multi_io(self): + """ Build TFTransformer with multiple I/O tensors """ + self.setup_iomap(replica=3) + with self._run_test_in_tf_session() as sess: + xs = [] + for tnsr_op_name in self.input_mapping.values(): + x = tf.placeholder(tf.float64, shape=[None, self.vec_size], name=tnsr_op_name) + xs.append(x) + + zs = [] + for i, tnsr_op_name in enumerate(self.output_mapping.keys()): + z = tf.reduce_mean(xs[i], axis=1, name=tnsr_op_name) + zs.append(z) + + gin = TFInputGraph.fromGraph( + sess.graph, sess, self.feed_names, self.fetch_names) + self.build_standard_transformers(sess, gin) + + gin = TFInputGraph.fromGraphDef( + sess.graph.as_graph_def(), self.feed_names, self.fetch_names) + self.build_standard_transformers(sess, gin) - # self.build_standard_transformers(sess, sess.graph) - # self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraph(sess.graph)) # def test_mixed_keras_graph(self): # # Build the graph: the output should have the same leading/batch dimension From a3517d6f5e3c5f2a30ba5588ae513a132f374224 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Sat, 16 Sep 2017 20:14:41 -0700 Subject: [PATCH 20/23] enable all tests --- python/sparkdl/graph/input.py | 57 ++----------- python/tests/transformers/tf_tensor_test.py | 94 +++++++++++---------- 2 files changed, 56 insertions(+), 95 deletions(-) diff --git a/python/sparkdl/graph/input.py b/python/sparkdl/graph/input.py index 285dd79a..53b923c7 100644 --- a/python/sparkdl/graph/input.py +++ b/python/sparkdl/graph/input.py @@ -127,8 +127,10 @@ def _from_checkpoint_impl(cls, """ Construct a TFInputGraphBuilder from a model checkpoint """ - assert (feed_names is None) == (fetch_names is None) - assert (feed_names is None) or (signature_def_key is None) + assert (feed_names is None) == (fetch_names is None), \ + 'feed_names and fetch_names, if provided must appear together' + assert (feed_names is None) != (signature_def_key is None), \ + 'must either provide feed_names or singnature_def_key' def import_graph_fn(sess): # Load checkpoint and import the graph @@ -164,8 +166,10 @@ def _from_saved_model_impl(cls, saved_model_dir, tag_set, """ Construct a TFInputGraphBuilder from a SavedModel """ - assert (feed_names is None) == (fetch_names is None) - assert (feed_names is None) or (signature_def_key is None) + assert (feed_names is None) == (fetch_names is None), \ + 'feed_names and fetch_names, if provided must appear together' + assert (feed_names is None) != (signature_def_key is None), \ + 'must either provide feed_names or singnature_def_key' def import_graph_fn(sess): tag_sets = tag_set.split(',') @@ -225,7 +229,6 @@ def _build_impl(self, feed_names, fetch_names): gin = TFInputGraph._new_obj_internal() assert (feed_names is None) == (fetch_names is None) must_have_sig_def = fetch_names is None - print('builder-session', repr(self.sess)) # NOTE(phi-dbq): both have to be set to default with self.sess.as_default(), self.graph.as_default(): _ginfo = self.import_graph_fn(self.sess) @@ -249,47 +252,3 @@ def build(self, feed_names=None, fetch_names=None): if self._should_clean: self.sess.close() return gin - -# def the_rest(input_mapping, output_mapping): -# graph = tf.Graph() -# with tf.Session(graph=graph) as sess: -# # Append feeds and input mapping -# _input_mapping = {} -# if isinstance(input_mapping, dict): -# input_mapping = input_mapping.items() -# for input_colname, tnsr_or_sig in input_mapping: -# if sig_def: -# tnsr = sig_def.inputs[tnsr_or_sig].name -# else: -# tnsr = tnsr_or_sig -# _input_mapping[input_colname] = tfx.op_name(graph, tnsr) -# input_mapping = _input_mapping - -# # Append fetches and output mapping -# fetches = [] -# _output_mapping = {} -# # By default the output columns will have the name of their -# # corresponding `tf.Graph` operation names. -# # We have to convert them to the user specified output names -# if isinstance(output_mapping, dict): -# output_mapping = output_mapping.items() -# for tnsr_or_sig, requested_colname in output_mapping: -# if sig_def: -# tnsr = sig_def.outputs[tnsr_or_sig].name -# else: -# tnsr = tnsr_or_sig -# fetches.append(tfx.get_tensor(graph, tnsr)) -# tf_output_colname = tfx.op_name(graph, tnsr) -# # NOTE(phi-dbq): put the check here as it will be the entry point to construct -# # a `TFInputGraph` object. -# assert tf_output_colname not in _output_mapping, \ -# "operation {} has multiple output tensors and ".format(tf_output_colname) + \ -# "at least two of them are used in the output DataFrame. " + \ -# "Operation names are used to name columns which leads to conflicts. " + \ -# "You can apply `tf.identity` ops to each to avoid name conflicts." -# _output_mapping[tf_output_colname] = requested_colname -# output_mapping = _output_mapping - -# gdef = tfx.strip_and_freeze_until(fetches, graph, sess) - -# return TFInputGraph(gdef), input_mapping, output_mapping diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index 545f374f..fdeb42ea 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -27,6 +27,7 @@ from pyspark.sql.types import Row +from sparkdl.graph.builder import IsolatedSession from sparkdl.graph.input import * import sparkdl.graph.utils as tfx from sparkdl.transformers.tf_tensor import TFTransformer @@ -60,6 +61,15 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.model_output_root, ignore_errors=True) + def _build_default_session_tests(self, sess): + gin = TFInputGraph.fromGraph( + sess.graph, sess, self.feed_names, self.fetch_names) + self.build_standard_transformers(sess, gin) + + gin = TFInputGraph.fromGraphDef( + sess.graph.as_graph_def(), self.feed_names, self.fetch_names) + self.build_standard_transformers(sess, gin) + def build_standard_transformers(self, sess, tf_input_graph): def _add_transformer(imap, omap): trnsfmr = TFTransformer( @@ -113,7 +123,7 @@ def _run_test_in_tf_session(self): # Build the TensorFlow graph graph = tf.Graph() - with tf.Session(graph=graph) as sess: + with tf.Session(graph=graph) as sess, graph.as_default(): # Build test graph and transformers from here yield sess @@ -148,7 +158,7 @@ def _run_test_in_tf_session(self): out_tgt = np.hstack(_results) self.assertTrue(np.allclose(out_ref, out_tgt), - msg=repr(transfomer)) + msg='not close => {} != {}'.format(out_ref.shape, out_tgt.shape)) def test_build_from_tf_graph(self): @@ -159,13 +169,7 @@ def test_build_from_tf_graph(self): _ = tf.reduce_mean(x, axis=1, name=self.output_op_name) # End building graph - # Begin building transformers - self.build_standard_transformers( - sess, TFInputGraph.fromGraph(sess.graph, sess, self.feed_names, self.fetch_names)) - gdef = sess.graph.as_graph_def() - self.build_standard_transformers( - sess, TFInputGraph.fromGraphDef(gdef, self.feed_names, self.fetch_names)) - # End building transformers + self._build_default_session_tests(sess) def test_build_from_saved_model(self): @@ -224,6 +228,10 @@ def test_build_from_saved_model(self): saved_model_dir, serving_tag, self.feed_names, self.fetch_names) self.build_standard_transformers(sess, gin) + gin = TFInputGraph.fromGraph( + sess.graph, sess, self.feed_names, self.fetch_names) + self.build_standard_transformers(sess, gin) + def test_build_from_checkpoint(self): """ Build TFTransformer from a model checkpoint """ @@ -285,6 +293,10 @@ def test_build_from_checkpoint(self): gin = TFInputGraph.fromCheckpoint(model_ckpt_dir, self.feed_names, self.fetch_names) self.build_standard_transformers(sess, gin) + gin = TFInputGraph.fromGraph( + sess.graph, sess, self.feed_names, self.fetch_names) + self.build_standard_transformers(sess, gin) + def test_multi_io(self): """ Build TFTransformer with multiple I/O tensors """ @@ -300,41 +312,31 @@ def test_multi_io(self): z = tf.reduce_mean(xs[i], axis=1, name=tnsr_op_name) zs.append(z) - gin = TFInputGraph.fromGraph( - sess.graph, sess, self.feed_names, self.fetch_names) - self.build_standard_transformers(sess, gin) - - gin = TFInputGraph.fromGraphDef( - sess.graph.as_graph_def(), self.feed_names, self.fetch_names) - self.build_standard_transformers(sess, gin) - + self._build_default_session_tests(sess) + + + def test_mixed_keras_graph(self): + """ Build mixed keras graph """ + with IsolatedSession(using_keras=True) as issn: + tnsr_in = tf.placeholder( + tf.double, shape=[None, self.vec_size], name=self.input_op_name) + inp = tf.expand_dims(tnsr_in, axis=2) + # Keras layers does not take tf.double + inp = tf.cast(inp, tf.float32) + conv = Conv1D(filters=4, kernel_size=2)(inp) + pool = MaxPool1D(pool_size=2)(conv) + flat = Flatten()(pool) + dense = Dense(1)(flat) + # We must keep the leading dimension of the output + redsum = tf.reduce_logsumexp(dense, axis=1) + tnsr_out = tf.cast(redsum, tf.double, name=self.output_op_name) + + # Initialize the variables + init_op = tf.global_variables_initializer() + issn.run(init_op) + # We could train the model ... but skip it here + gfn = issn.asGraphFunction([tnsr_in], [tnsr_out]) - # def test_mixed_keras_graph(self): - # # Build the graph: the output should have the same leading/batch dimension - # with IsolatedSession(using_keras=True) as issn: - # tnsr_in = tf.placeholder( - # tf.double, shape=[None, self.vec_size], name=self.input_op_name) - # inp = tf.expand_dims(tnsr_in, axis=2) - # # Keras layers does not take tf.double - # inp = tf.cast(inp, tf.float32) - # conv = Conv1D(filters=4, kernel_size=2)(inp) - # pool = MaxPool1D(pool_size=2)(conv) - # flat = Flatten()(pool) - # dense = Dense(1)(flat) - # # We must keep the leading dimension of the output - # redsum = tf.reduce_sum(dense, axis=1) - # tnsr_out = tf.cast(redsum, tf.double, name=self.output_op_name) - - # # Initialize the variables - # init_op = tf.global_variables_initializer() - # issn.run(init_op) - # # We could train the model ... but skip it here - # gfn = issn.asGraphFunction([tnsr_in], [tnsr_out]) - - # with self.run_test_in_tf_session() as sess: - # tf.import_graph_def(gfn.graph_def, name='') - - # self.build_standard_transformers(sess, sess.graph) - # self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraph(sess.graph)) - # self.build_standard_transformers(sess, gfn.graph_def) - # self.build_standard_transformers(sess, TFInputGraphBuilder.fromGraphDef(gfn.graph_def)) + with self._run_test_in_tf_session() as sess: + tf.import_graph_def(gfn.graph_def, name='') + self._build_default_session_tests(sess) From 457a4c28d001cccc68f4964afc67d06ca107a870 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Mon, 18 Sep 2017 14:06:56 -0700 Subject: [PATCH 21/23] params and converters --- python/sparkdl/param/__init__.py | 8 +- python/sparkdl/param/converters.py | 86 +++++++++++++++++ python/sparkdl/param/shared_params.py | 130 +++++++++++++++----------- 3 files changed, 165 insertions(+), 59 deletions(-) create mode 100644 python/sparkdl/param/converters.py diff --git a/python/sparkdl/param/__init__.py b/python/sparkdl/param/__init__.py index 98a8f7dd..a291a7d4 100644 --- a/python/sparkdl/param/__init__.py +++ b/python/sparkdl/param/__init__.py @@ -14,7 +14,11 @@ # from sparkdl.param.shared_params import ( - keyword_only, HasInputCol, HasOutputCol, HasLabelCol, HasKerasModel, - HasKerasLoss, HasKerasOptimizer, HasOutputNodeName, SparkDLTypeConverters) + keyword_only, HasInputCol, HasOutputCol, HasLabelCol, + # TFTransformer Params + HasInputMapping, HasOutputMapping, HasTFHParams, + # Keras Estimator Params + HasKerasModel, HasKerasLoss, HasKerasOptimizer, HasOutputNodeName) +from sparkdl.param.converters import SparkDLTypeConverters from sparkdl.param.image_params import ( CanLoadImage, HasInputImageNodeName, HasOutputMode, OUTPUT_MODES) diff --git a/python/sparkdl/param/converters.py b/python/sparkdl/param/converters.py new file mode 100644 index 00000000..52f76fb9 --- /dev/null +++ b/python/sparkdl/param/converters.py @@ -0,0 +1,86 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import tensorflow as tf + +from pyspark.ml.param import TypeConverters + +import sparkdl.graph.utils as tfx +import sparkdl.utils.keras_model as kmutil + +__all__ = ['SparkDLTypeConverters'] + +class SparkDLTypeConverters(object): + @staticmethod + def toTFGraph(value): + if isinstance(value, tf.Graph): + return value + else: + raise TypeError("Could not convert %s to TensorFlow Graph" % type(value)) + + @staticmethod + def asColumnToTensorMap(value): + if isinstance(value, dict): + strs_pair_seq = [(k, tfx.as_op_name(v)) for k, v in value.items()] + return sorted(strs_pair_seq) + raise TypeError("Could not convert %s to TensorFlow Tensor" % type(value)) + + @staticmethod + def asTensorToColumnMap(value): + if isinstance(value, dict): + strs_pair_seq = [(tfx.as_op_name(k), v) for k, v in value.items()] + return sorted(strs_pair_seq) + raise TypeError("Could not convert %s to TensorFlow Tensor" % type(value)) + + @staticmethod + def toTFHParams(value): + if isinstance(value, tf.contrib.training.HParams): + return value + else: + raise TypeError("Could not convert %s to TensorFlow HParams" % type(value)) + + @staticmethod + def toStringOrTFTensor(value): + if isinstance(value, tf.Tensor): + return value + else: + try: + return TypeConverters.toString(value) + except TypeError: + raise TypeError("Could not convert %s to tensorflow.Tensor or str" % type(value)) + + @staticmethod + def supportedNameConverter(supportedList): + def converter(value): + if value in supportedList: + return value + else: + raise TypeError("%s %s is not in the supported list." % type(value), str(value)) + + return converter + + @staticmethod + def toKerasLoss(value): + if kmutil.is_valid_loss_function(value): + return value + raise ValueError( + "Named loss not supported in Keras: {} type({})".format(value, type(value))) + + @staticmethod + def toKerasOptimizer(value): + if kmutil.is_valid_optimizer(value): + return value + raise TypeError( + "Named optimizer not supported in Keras: {} type({})".format(value, type(value))) diff --git a/python/sparkdl/param/shared_params.py b/python/sparkdl/param/shared_params.py index e169e891..83883235 100644 --- a/python/sparkdl/param/shared_params.py +++ b/python/sparkdl/param/shared_params.py @@ -21,13 +21,15 @@ from functools import wraps -import tensorflow as tf - from pyspark.ml.param import Param, Params, TypeConverters -import sparkdl.utils.keras_model as kmutil +from sparkdl.param.converters import SparkDLTypeConverters + -# From pyspark +######################################################## +# Copied from PySpark for backward compatibility. +# They first appeared in Apache Spark version 2.1.1. +######################################################## def keyword_only(func): """ @@ -36,12 +38,14 @@ def keyword_only(func): .. note:: Should only be used to wrap a method where first arg is `self` """ + @wraps(func) def wrapper(self, *args, **kwargs): if len(args) > 0: raise TypeError("Method %s forces keyword arguments." % func.__name__) self._input_kwargs = kwargs return func(self, **kwargs) + return wrapper @@ -50,10 +54,8 @@ class HasInputCol(Params): Mixin for param inputCol: input column name. """ - inputCol = Param(Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasInputCol, self).__init__() + inputCol = Param( + Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString) def setInputCol(self, value): """ @@ -73,8 +75,8 @@ class HasOutputCol(Params): Mixin for param outputCol: output column name. """ - outputCol = Param(Params._dummy(), - "outputCol", "output column name.", typeConverter=TypeConverters.toString) + outputCol = Param( + Params._dummy(), "outputCol", "output column name.", typeConverter=TypeConverters.toString) def __init__(self): super(HasOutputCol, self).__init__() @@ -92,54 +94,9 @@ def getOutputCol(self): """ return self.getOrDefault(self.outputCol) -############################################ +######################################################## # New in sparkdl -############################################ - -class SparkDLTypeConverters(object): - - @staticmethod - def toStringOrTFTensor(value): - if isinstance(value, tf.Tensor): - return value - else: - try: - return TypeConverters.toString(value) - except TypeError: - raise TypeError("Could not convert %s to tensorflow.Tensor or str" % type(value)) - - @staticmethod - def toTFGraph(value): - # TODO: we may want to support tf.GraphDef in the future instead of tf.Graph since user - # is less likely to mess up using GraphDef vs Graph (e.g. constants vs variables). - if isinstance(value, tf.Graph): - return value - else: - raise TypeError("Could not convert %s to tensorflow.Graph type" % type(value)) - - @staticmethod - def supportedNameConverter(supportedList): - def converter(value): - if value in supportedList: - return value - else: - raise TypeError("%s %s is not in the supported list." % type(value), str(value)) - - return converter - - @staticmethod - def toKerasLoss(value): - if kmutil.is_valid_loss_function(value): - return value - raise ValueError( - "Named loss not supported in Keras: {} type({})".format(value, type(value))) - - @staticmethod - def toKerasOptimizer(value): - if kmutil.is_valid_optimizer(value): - return value - raise TypeError( - "Named optimizer not supported in Keras: {} type({})".format(value, type(value))) +######################################################## class HasOutputNodeName(Params): @@ -233,3 +190,62 @@ def seKerasLoss(self, value): def getKerasLoss(self): return self.getOrDefault(self.kerasLoss) + + +class HasOutputMapping(Params): + """ + Mixin for param outputMapping: ordered list of ('outputTensorOpName', 'outputColName') pairs + """ + outputMapping = Param( + Params._dummy(), + "outputMapping", + "Mapping output :class:`tf.Operation` names to DataFrame column names", + typeConverter=SparkDLTypeConverters.asTensorToColumnMap) + + def setOutputMapping(self, value): + # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the + # serializable TFInputGraph object once the inputMapping and outputMapping + # parameters are provided. + raise NotImplementedError( + "Please use the Transformer's constructor to assign `outputMapping` field.") + + def getOutputMapping(self): + return self.getOrDefault(self.outputMapping) + + +class HasInputMapping(Params): + """ + Mixin for param inputMapping: ordered list of ('inputColName', 'inputTensorOpName') pairs + """ + inputMapping = Param( + Params._dummy(), + "inputMapping", + "Mapping input DataFrame column names to :class:`tf.Operation` names", + typeConverter=SparkDLTypeConverters.asColumnToTensorMap) + + def setInputMapping(self, value): + # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the + # serializable TFInputGraph object once the inputMapping and outputMapping + # parameters are provided. + raise NotImplementedError( + "Please use the Transformer's constructor to assigne `inputMapping` field.") + + def getInputMapping(self): + return self.getOrDefault(self.inputMapping) + + +class HasTFHParams(Params): + """ + Mixin for TensorFlow model hyper-parameters + """ + tfHParams = Param( + Params._dummy(), + "hparams", + "instance of :class:`tf.contrib.training.HParams`, a key-value map-like object", + typeConverter=SparkDLTypeConverters.toTFHParams) + + def setTFHParams(self, value): + return self._set(tfHParam=value) + + def getTFHParams(self): + return self.getOrDefault(self.tfHParams) From 323939af11b94554d3b377758a15e9a9257b2a6d Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Mon, 18 Sep 2017 14:58:58 -0700 Subject: [PATCH 22/23] tests --- python/sparkdl/param/converters.py | 45 +++++++++++++---- python/sparkdl/param/shared_params.py | 27 +++++------ python/tests/param/__init__.py | 15 ++++++ python/tests/param/params_test.py | 69 +++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 25 deletions(-) create mode 100644 python/tests/param/__init__.py create mode 100644 python/tests/param/params_test.py diff --git a/python/sparkdl/param/converters.py b/python/sparkdl/param/converters.py index 52f76fb9..1a65915a 100644 --- a/python/sparkdl/param/converters.py +++ b/python/sparkdl/param/converters.py @@ -13,6 +13,8 @@ # limitations under the License. # +import six + import tensorflow as tf from pyspark.ml.param import TypeConverters @@ -22,6 +24,35 @@ __all__ = ['SparkDLTypeConverters'] +def _try_convert_tf_tensor_mapping(value, is_key_tf_tensor=True): + if isinstance(value, dict): + strs_pair_seq = [] + for k, v in value.items(): + try: + if is_key_tf_tensor: + _pair = (tfx.as_tensor_name(k), v) + else: + _pair = (k, tfx.as_tensor_name(v)) + except: + err_msg = "Can NOT convert {} (type {}) to tf.Tensor name" + _not_tf_op = k if is_key_tf_tensor else v + raise TypeError(err_msg.format(_not_tf_op, type(_not_tf_op))) + + str_val = v if is_key_tf_tensor else k + if not isinstance(str_val, six.string_types): + err_msg = 'expect string type for {}, but got {}' + raise TypeError(err_msg.format(str_val, type(str_val))) + + strs_pair_seq.append(_pair) + + return sorted(strs_pair_seq) + + if is_key_tf_tensor: + raise TypeError("Could not convert %s to tf.Tensor name to str mapping" % type(value)) + else: + raise TypeError("Could not convert %s to str to tf.Tensor name mapping" % type(value)) + + class SparkDLTypeConverters(object): @staticmethod def toTFGraph(value): @@ -31,18 +62,12 @@ def toTFGraph(value): raise TypeError("Could not convert %s to TensorFlow Graph" % type(value)) @staticmethod - def asColumnToTensorMap(value): - if isinstance(value, dict): - strs_pair_seq = [(k, tfx.as_op_name(v)) for k, v in value.items()] - return sorted(strs_pair_seq) - raise TypeError("Could not convert %s to TensorFlow Tensor" % type(value)) + def asColumnToTensorNameMap(value): + return _try_convert_tf_tensor_mapping(value, is_key_tf_tensor=False) @staticmethod - def asTensorToColumnMap(value): - if isinstance(value, dict): - strs_pair_seq = [(tfx.as_op_name(k), v) for k, v in value.items()] - return sorted(strs_pair_seq) - raise TypeError("Could not convert %s to TensorFlow Tensor" % type(value)) + def asTensorNameToColumnMap(value): + return _try_convert_tf_tensor_mapping(value, is_key_tf_tensor=True) @staticmethod def toTFHParams(value): diff --git a/python/sparkdl/param/shared_params.py b/python/sparkdl/param/shared_params.py index 83883235..890dc0b3 100644 --- a/python/sparkdl/param/shared_params.py +++ b/python/sparkdl/param/shared_params.py @@ -196,11 +196,10 @@ class HasOutputMapping(Params): """ Mixin for param outputMapping: ordered list of ('outputTensorOpName', 'outputColName') pairs """ - outputMapping = Param( - Params._dummy(), - "outputMapping", - "Mapping output :class:`tf.Operation` names to DataFrame column names", - typeConverter=SparkDLTypeConverters.asTensorToColumnMap) + outputMapping = Param(Params._dummy(), + "outputMapping", + "Mapping output :class:`tf.Operation` names to DataFrame column names", + typeConverter=SparkDLTypeConverters.asTensorNameToColumnMap) def setOutputMapping(self, value): # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the @@ -217,11 +216,10 @@ class HasInputMapping(Params): """ Mixin for param inputMapping: ordered list of ('inputColName', 'inputTensorOpName') pairs """ - inputMapping = Param( - Params._dummy(), - "inputMapping", - "Mapping input DataFrame column names to :class:`tf.Operation` names", - typeConverter=SparkDLTypeConverters.asColumnToTensorMap) + inputMapping = Param(Params._dummy(), + "inputMapping", + "Mapping input DataFrame column names to :class:`tf.Operation` names", + typeConverter=SparkDLTypeConverters.asColumnToTensorNameMap) def setInputMapping(self, value): # NOTE(phi-dbq): due to the nature of TensorFlow import modes, we can only derive the @@ -238,11 +236,10 @@ class HasTFHParams(Params): """ Mixin for TensorFlow model hyper-parameters """ - tfHParams = Param( - Params._dummy(), - "hparams", - "instance of :class:`tf.contrib.training.HParams`, a key-value map-like object", - typeConverter=SparkDLTypeConverters.toTFHParams) + tfHParams = Param(Params._dummy(), + "hparams", + "instance of :class:`tf.contrib.training.HParams`, a key-value map-like object", + typeConverter=SparkDLTypeConverters.toTFHParams) def setTFHParams(self, value): return self._set(tfHParam=value) diff --git a/python/tests/param/__init__.py b/python/tests/param/__init__.py new file mode 100644 index 00000000..7084f22b --- /dev/null +++ b/python/tests/param/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/tests/param/params_test.py b/python/tests/param/params_test.py new file mode 100644 index 00000000..0c10411a --- /dev/null +++ b/python/tests/param/params_test.py @@ -0,0 +1,69 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import sys + +if sys.version_info[:2] <= (2, 6): + try: + import unittest2 as unittest + except ImportError: + sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') + sys.exit(1) +else: + import unittest + +from sparkdl.param.converters import SparkDLTypeConverters as conv + +class ParamsConverterTest(unittest.TestCase): + # pylint: disable=protected-access + + def test_tf_input_mapping_converter(self): + valid_tnsr_input = {'colA': 'tnsrOpA:0', + 'colB': 'tnsrOpB:0'} + valid_op_input = {'colA': 'tnsrOpA', + 'colB': 'tnsrOpB'} + valid_input_mapping_result = [('colA', 'tnsrOpA:0'), + ('colB', 'tnsrOpB:0')] + + for valid_input_mapping in [valid_op_input, valid_tnsr_input]: + res = conv.asColumnToTensorNameMap(valid_input_mapping) + self.assertEqual(valid_input_mapping_result, res) + + def test_tf_output_mapping_converter(self): + valid_tnsr_output = {'tnsrOpA:0': 'colA', + 'tnsrOpB:0': 'colB'} + valid_op_output = {'tnsrOpA': 'colA', + 'tnsrOpB': 'colB'} + valid_output_mapping_result = [('tnsrOpA:0', 'colA'), + ('tnsrOpB:0', 'colB')] + + for valid_output_mapping in [valid_tnsr_output, valid_op_output]: + res = conv.asTensorNameToColumnMap(valid_output_mapping) + self.assertEqual(valid_output_mapping_result, res) + + + def test_invalid_input_mapping(self): + for invalid in [['a1', 'b2'], ('c3', 'd4'), [('a', 1), ('b', 2)]]: + with self.assertRaises(TypeError): + conv.asColumnToTensorNameMap(invalid) + conv.asTensorNameToColumnMap(invalid) + + with self.assertRaises(TypeError): + # Wrong value type: must be string + conv.asTensorNameToColumnMap({1: 'a', 2.0: 'b'}) + conv.asColumnToTensorNameMap({'a': 1, 'b': 2.0}) + + # Wrong containter type: only accept dict + conv.asColumnToTensorNameMap([('colA', 'tnsrA:0'), ('colB', 'tnsrB:0')]) + conv.asTensorNameToColumnMap([('tnsrA:0', 'colA'), ('tnsrB:0', 'colB')]) From b232b3c9f7a62467a84bd5a1f949bc951e7da1e9 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Mon, 18 Sep 2017 16:59:31 -0700 Subject: [PATCH 23/23] optimize graph for inference --- python/sparkdl/graph/builder.py | 8 ++++---- python/sparkdl/transformers/tf_tensor.py | 20 ++++++++++++++++++-- python/tests/transformers/tf_tensor_test.py | 4 +++- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/python/sparkdl/graph/builder.py b/python/sparkdl/graph/builder.py index 86c3b3ce..67510fc7 100644 --- a/python/sparkdl/graph/builder.py +++ b/python/sparkdl/graph/builder.py @@ -47,19 +47,20 @@ def __init__(self, graph=None, using_keras=False): self.graph = graph or tf.Graph() self.sess = tf.Session(graph=self.graph) if using_keras: + self.using_keras = True self.keras_prev_sess = K.get_session() else: + self.using_keras = False self.keras_prev_sess = None def __enter__(self): - self.sess.as_default() self.sess.__enter__() - if self.keras_prev_sess is not None: + if self.using_keras: K.set_session(self.sess) return self def __exit__(self, *args): - if self.keras_prev_sess is not None: + if self.using_keras: K.set_session(self.keras_prev_sess) self.sess.__exit__(*args) @@ -268,4 +269,3 @@ def fromList(cls, functions): gfn = issn.asGraphFunction(first_inputs, last_outputs) return gfn - diff --git a/python/sparkdl/transformers/tf_tensor.py b/python/sparkdl/transformers/tf_tensor.py index 0b4d5c2e..8b11d9af 100644 --- a/python/sparkdl/transformers/tf_tensor.py +++ b/python/sparkdl/transformers/tf_tensor.py @@ -16,6 +16,7 @@ import logging import tensorflow as tf +from tensorflow.python.tools import optimize_for_inference_lib as infr_opt import tensorframes as tfs from pyspark.ml import Transformer @@ -60,17 +61,32 @@ def setParams(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tf # Further conanonicalization, e.g. converting dict to sorted str pairs happens here return self._set(**kwargs) - def _transform(self, dataset): + def _optimize_for_inference(self): + """ Optimize the graph for inference """ gin = self.getTFInputGraph() input_mapping = self.getInputMapping() output_mapping = self.getOutputMapping() + input_node_names = [tfx.as_op_name(tnsr_name) for _, tnsr_name in input_mapping] + output_node_names = [tfx.as_op_name(tnsr_name) for tnsr_name, _ in output_mapping] + + # NOTE(phi-dbq): Spark DataFrame assumes float64 as default floating point type + opt_gdef = infr_opt.optimize_for_inference(gin.graph_def, + input_node_names, + output_node_names, + tf.float64.as_datatype_enum) + return opt_gdef + + def _transform(self, dataset): + graph_def = self._optimize_for_inference() + input_mapping = self.getInputMapping() + output_mapping = self.getOutputMapping() graph = tf.Graph() with tf.Session(graph=graph): analyzed_df = tfs.analyze(dataset) out_tnsr_op_names = [tfx.as_op_name(tnsr_name) for tnsr_name, _ in output_mapping] - tf.import_graph_def(graph_def=gin.graph_def, name='', return_elements=out_tnsr_op_names) + tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names) feed_dict = dict((tfx.op_name(graph, tnsr_name), col_name) for col_name, tnsr_name in input_mapping) diff --git a/python/tests/transformers/tf_tensor_test.py b/python/tests/transformers/tf_tensor_test.py index fdeb42ea..c20a8e72 100644 --- a/python/tests/transformers/tf_tensor_test.py +++ b/python/tests/transformers/tf_tensor_test.py @@ -157,8 +157,10 @@ def _run_test_in_tf_session(self): _results.append(np.ravel(curr_res)) out_tgt = np.hstack(_results) + err_msg = 'not close => {} != {}, max_diff {}' self.assertTrue(np.allclose(out_ref, out_tgt), - msg='not close => {} != {}'.format(out_ref.shape, out_tgt.shape)) + msg=err_msg.format(out_ref.shape, out_tgt.shape, + np.max(np.abs(out_ref - out_tgt)))) def test_build_from_tf_graph(self):