20 | 20 |
21 | 21 | @tf.keras.utils.register_keras_serializable(package="Addons") |
22 | 22 | class MultiHeadAttention(tf.keras.layers.Layer): |
23 | | - r""" |
24 | | - MultiHead Attention layer. |
| 23 | + r"""MultiHead Attention layer. |
25 | 24 |
26 | 25 | Defines the MultiHead Attention operation as described in |
27 | 26 | [Attention Is All You Need](https://arxiv.org/abs/1706.03762) which takes |
28 | 27 | in the tensors `query`, `key`, and `value`, and returns the dot-product attention |
29 | 28 | between them: |
30 | 29 |
31 | | - ```python |
32 | | - mha = MultiHeadAttention(head_size=128, num_heads=12) |
| 30 | + ```python |
| 31 | + mha = MultiHeadAttention(head_size=128, num_heads=12) |
33 | 32 |
34 | | - query = tf.random.uniform((32, 20, 200)) # (batch_size, query_elements, query_depth) |
35 | | - key = tf.random.uniform((32, 15, 300)) # (batch_size, key_elements, key_depth) |
36 | | - value = tf.random.uniform((32, 15, 400)) # (batch_size, key_elements, value_depth) |
| 33 | + query = tf.random.uniform((32, 20, 200)) # (batch_size, query_elements, query_depth) |
| 34 | + key = tf.random.uniform((32, 15, 300)) # (batch_size, key_elements, key_depth) |
| 35 | + value = tf.random.uniform((32, 15, 400)) # (batch_size, key_elements, value_depth) |
37 | 36 |
38 | | - attention = mha([query, key, value]) # (batch_size, query_elements, value_depth) |
39 | | - ``` |
| 37 | + attention = mha([query, key, value]) # (batch_size, query_elements, value_depth) |
| 38 | + ``` |
40 | 39 |
41 | 40 | If `value` is not given then internally `value = key` will be used: |
42 | 41 |
43 | | - ```python |
44 | | - mha = MultiHeadAttention(head_size=128, num_heads=12) |
| 42 | + ```python |
| 43 | + mha = MultiHeadAttention(head_size=128, num_heads=12) |
45 | 44 |
46 | | - query = tf.random.uniform((32, 20, 200)) # (batch_size, query_elements, query_depth) |
47 | | - key = tf.random.uniform((32, 15, 300)) # (batch_size, key_elements, key_depth) |
| 45 | + query = tf.random.uniform((32, 20, 200)) # (batch_size, query_elements, query_depth) |
| 46 | + key = tf.random.uniform((32, 15, 300)) # (batch_size, key_elements, key_depth) |
48 | 47 |
49 | | - attention = mha([query, key]) # (batch_size, query_elements, key_depth) |
50 | | - ``` |
| 48 | + attention = mha([query, key]) # (batch_size, query_elements, key_depth) |
| 49 | + ``` |
51 | 50 |
52 | 51 | Arguments: |
53 | 52 | head_size: int, dimensionality of the `query`, `key` and `value` tensors |
54 | | - after the linear transformation. |
| 53 | + after the linear transformation. |
55 | 54 | num_heads: int, number of attention heads. |
56 | 55 | output_size: int, dimensionality of the output space, if `None` then the |
57 | | - input dimension of |
58 | | - `value` or `key` will be used, default `None`. |
| 56 | + input dimension of `value` or `key` will be used, |
| 57 | + default `None`. |
59 | 58 | dropout: float, `rate` parameter for the dropout layer that is |
60 | | - applied to attention after softmax, |
| 59 | + applied to attention after softmax, |
61 | 60 | default `0`. |
62 | 61 | use_projection_bias: bool, whether to use a bias term after the linear |
63 | | - output projection. |
| 62 | + output projection. |
64 | 63 | return_attn_coef: bool, if `True`, return the attention coefficients as |
65 | | - an additional output argument. |
| 64 | + an additional output argument. |
66 | 65 | kernel_initializer: initializer, initializer for the kernel weights. |
67 | 66 | kernel_regularizer: regularizer, regularizer for the kernel weights. |
68 | 67 | kernel_constraint: constraint, constraint for the kernel weights. |
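
A minimal usage sketch tying these arguments together (assuming the layer is exposed as `tfa.layers.MultiHeadAttention`; the shapes and the attention-coefficient layout shown in the comments are illustrative, not taken from this diff):

```python
import tensorflow as tf
import tensorflow_addons as tfa  # assumed import path for the layer edited in this file

# 8 heads of size 64, with dropout on the attention weights and the
# attention coefficients returned as a second output.
mha = tfa.layers.MultiHeadAttention(
    head_size=64,
    num_heads=8,
    output_size=256,        # if None, the depth of `value` (or `key`) is used
    dropout=0.1,            # applied to the attention weights after softmax
    use_projection_bias=True,
    return_attn_coef=True,
)

query = tf.random.uniform((32, 20, 200))  # (batch_size, query_elements, query_depth)
key = tf.random.uniform((32, 15, 300))    # (batch_size, key_elements, key_depth)
value = tf.random.uniform((32, 15, 400))  # (batch_size, key_elements, value_depth)

output, attn_coef = mha([query, key, value])
# output:    (32, 20, 256)   -- (batch_size, query_elements, output_size)
# attn_coef: (32, 8, 20, 15) -- per-head attention weights (assumed layout:
#                               batch_size, num_heads, query_elements, key_elements)
```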