From 60917f41e8bee8f0e4cd14538e9a0926a1575b63 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 30 Apr 2020 10:28:22 +0800 Subject: [PATCH 01/16] Add test for text.py --- examples/transformer/transformer.py | 26 +- hapi/model.py | 21 +- hapi/tests/test_text.py | 508 ++++++++++++++++++++++++++++ hapi/text/text.py | 6 +- 4 files changed, 525 insertions(+), 36 deletions(-) create mode 100644 hapi/tests/test_text.py diff --git a/examples/transformer/transformer.py b/examples/transformer/transformer.py index 30bb931..b2ec120 100644 --- a/examples/transformer/transformer.py +++ b/examples/transformer/transformer.py @@ -18,10 +18,10 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers -from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable +from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay from hapi.model import Model, CrossEntropy, Loss -from hapi.text import TransformerBeamSearchDecoder, DynamicDecode +from hapi.text import TransformerCell, TransformerBeamSearchDecoder, DynamicDecode def position_encoding_init(n_position, d_pos_vec): @@ -606,26 +606,6 @@ def forward(self, src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, return predict -class TransfomerCell(object): - """ - Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be - used as RNNCell - """ - - def __init__(self, decoder): - self.decoder = decoder - - def __call__(self, inputs, states, trg_src_attn_bias, enc_output, - static_caches): - trg_word, trg_pos = inputs - for cache, static_cache in zip(states, static_caches): - cache.update(static_cache) - logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias, - enc_output, states) - new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states] - return logits, new_states - - class InferTransformer(Transformer): """ model for prediction @@ -657,7 +637,7 @@ def __init__(self, self.beam_size = args.pop("beam_size") self.max_out_len = args.pop("max_out_len") super(InferTransformer, self).__init__(**args) - cell = TransfomerCell(self.decoder) + cell = TransformerCell(self.decoder) self.beam_search_decoder = DynamicDecode( TransformerBeamSearchDecoder( cell, bos_id, eos_id, beam_size, var_dim_in_state=2), diff --git a/hapi/model.py b/hapi/model.py index 8c1c521..d825d5c 100644 --- a/hapi/model.py +++ b/hapi/model.py @@ -38,7 +38,7 @@ from hapi.distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized from hapi.metrics import Metric from hapi.callbacks import config_callbacks -from hapi.utils import to_list, to_numpy, flatten_list, restore_flatten_list +from hapi.utils import to_list, to_numpy, flatten_list, restore_flatten_list, extract_args __all__ = [ 'Model', @@ -495,14 +495,15 @@ def train_batch(self, inputs, labels=None): if labels is not None: labels = [to_variable(l) for l in to_list(labels)] if self._nranks > 1: - outputs = self.ddp_model.forward(*[to_variable(x) for x in inputs]) + outputs = self.ddp_model.forward( + * [to_variable(x) for x in inputs]) losses = self.model._loss_function(outputs, labels) final_loss = fluid.layers.sum(losses) final_loss = self.ddp_model.scale_loss(final_loss) final_loss.backward() self.ddp_model.apply_collective_grads() else: - outputs = self.model.forward(*[to_variable(x) for x in inputs]) + outputs = self.model.forward(* [to_variable(x) for x in inputs]) losses = self.model._loss_function(outputs, labels) final_loss = 
fluid.layers.sum(losses) final_loss.backward() @@ -511,9 +512,9 @@ def train_batch(self, inputs, labels=None): self.model.clear_gradients() metrics = [] for metric in self.model._metrics: - metric_outs = metric.add_metric_op(*( - to_list(outputs) + to_list(labels))) - m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) + metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list( + labels))) + m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)]) metrics.append(m) return ([to_numpy(l) for l in losses], metrics) \ @@ -525,7 +526,7 @@ def eval_batch(self, inputs, labels=None): inputs = to_list(inputs) if labels is not None: labels = [to_variable(l) for l in to_list(labels)] - outputs = self.model.forward(*[to_variable(x) for x in inputs]) + outputs = self.model.forward(* [to_variable(x) for x in inputs]) if self.model._loss_function: losses = self.model._loss_function(outputs, labels) else: @@ -551,9 +552,9 @@ def eval_batch(self, inputs, labels=None): self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_batch'] = samples - metric_outs = metric.add_metric_op(*( - to_list(outputs) + to_list(labels))) - m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) + metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list( + labels))) + m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)]) metrics.append(m) # To be consistent with static graph diff --git a/hapi/tests/test_text.py b/hapi/tests/test_text.py new file mode 100644 index 0000000..46efbf6 --- /dev/null +++ b/hapi/tests/test_text.py @@ -0,0 +1,508 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# when test, you should add hapi root path to the PYTHONPATH, +# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH +import unittest +import time +import random + +import numpy as np + +import paddle.fluid as fluid +from paddle.fluid.dygraph import Embedding, Linear, Layer +from paddle.fluid.layers import BeamSearchDecoder +import hapi.text as text +from hapi.model import Model, Input, set_device +from hapi.text import BasicLSTMCell, BasicGRUCell, RNN, DynamicDecode, MultiHeadAttention, TransformerEncoder +from hapi.text import * + + +def sigmoid(x): + return 1. / (1. + np.exp(-x)) + + +def tanh(x): + return 2. * sigmoid(2. * x) - 1. 
+ + +def lstm_step(step_in, pre_hidden, pre_cell, gate_w, gate_b, forget_bias=1.0): + concat_1 = np.concatenate([step_in, pre_hidden], 1) + + gate_input = np.matmul(concat_1, gate_w) + gate_input += gate_b + i, j, f, o = np.split(gate_input, indices_or_sections=4, axis=1) + + new_cell = pre_cell * sigmoid(f + forget_bias) + sigmoid(i) * tanh(j) + new_hidden = tanh(new_cell) * sigmoid(o) + + return new_hidden, new_cell + + +def gru_step(step_in, pre_hidden, gate_w, gate_b, candidate_w, candidate_b): + concat_1 = np.concatenate([step_in, pre_hidden], 1) + + gate_input = np.matmul(concat_1, gate_w) + gate_input += gate_b + gate_input = sigmoid(gate_input) + r, u = np.split(gate_input, indices_or_sections=2, axis=1) + + r_hidden = r * pre_hidden + + candidate = np.matmul(np.concatenate([step_in, r_hidden], 1), candidate_w) + + candidate += candidate_b + c = tanh(candidate) + + new_hidden = u * pre_hidden + (1 - u) * c + + return new_hidden + + +class ModuleApiTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._np_rand_state = np.random.get_state() + cls._py_rand_state = random.getstate() + cls._random_seed = 123 + np.random.seed(cls._random_seed) + random.seed(cls._random_seed) + + cls.model_cls = type(cls.__name__ + "Model", (Model, ), { + "__init__": cls.model_init_wrapper(cls.model_init), + "forward": cls.model_forward + }) + + @classmethod + def tearDownClass(cls): + np.random.set_state(cls._np_rand_state) + random.setstate(cls._py_rand_state) + + @staticmethod + def model_init_wrapper(func): + def __impl__(self, *args, **kwargs): + Model.__init__(self) + func(self, *args, **kwargs) + + return __impl__ + + @staticmethod + def model_init(self, *args, **kwargs): + raise NotImplementedError( + "model_init acts as `Model.__init__`, thus must implement it") + + @staticmethod + def model_forward(self, *args, **kwargs): + return self.module(*args, **kwargs) + + def make_inputs(self): + # TODO(guosheng): add default from `self.inputs` + raise NotImplementedError( + "model_inputs makes inputs for model, thus must implement it") + + def setUp(self): + """ + For the model which wraps the module to be tested: + Set input data by `self.inputs` list + Set init argument values by `self.attrs` list/dict + Set model parameter values by `self.param_states` dict + Set expected output data by `self.outputs` list + We can create a model instance and run once with these. 
+ """ + self.inputs = [] + self.attrs = {} + self.param_states = {} + self.outputs = [] + + def _calc_output(self, place, mode="test", dygraph=True): + if dygraph: + fluid.enable_dygraph(place) + else: + fluid.disable_dygraph() + fluid.default_main_program().random_seed = self._random_seed + fluid.default_startup_program().random_seed = self._random_seed + model = self.model_cls(**self.attrs) if isinstance( + self.attrs, dict) else self.model_cls(*self.attrs) + model.prepare(inputs=self.make_inputs(), device=place) + if self.param_states: + model.load(self.param_states, optim_state=None) + return model.test_batch(self.inputs) + + def check_output_with_place(self, place, mode="test"): + dygraph_output = self._calc_output(place, mode, dygraph=True) + stgraph_output = self._calc_output(place, mode, dygraph=False) + expect_output = getattr(self, "outputs", None) + for actual_t, expect_t in zip(dygraph_output, stgraph_output): + self.assertTrue(np.allclose(actual_t, expect_t, rtol=1e-5, atol=0)) + if expect_output: + for actual_t, expect_t in zip(dygraph_output, expect_output): + self.assertTrue( + np.allclose( + actual_t, expect_t, rtol=1e-5, atol=0)) + + def check_output(self): + devices = ["CPU", "GPU"] if fluid.is_compiled_with_cuda() else ["CPU"] + for device in devices: + place = set_device(device) + self.check_output_with_place(place) + + +class TestBasicLSTM(ModuleApiTest): + def setUp(self): + # TODO(guosheng): Change to big size. Currentlys bigger hidden size for + # LSTM would fail, the second static graph run might get diff output + # with others. + shape = (2, 4, 16) + self.inputs = [np.random.random(shape).astype("float32")] + self.outputs = None + self.attrs = {"input_size": 16, "hidden_size": 16} + self.param_states = {} + + @staticmethod + def model_init(self, input_size, hidden_size): + self.lstm = RNN( + BasicLSTMCell( + input_size, + hidden_size, + param_attr=fluid.ParamAttr(name="lstm_weight"), + bias_attr=fluid.ParamAttr(name="lstm_bias"))) + + @staticmethod + def model_forward(self, inputs): + return self.lstm(inputs)[0] + + def make_inputs(self): + inputs = [ + Input( + [None, None, self.inputs[-1].shape[-1]], + "float32", + name="input") + ] + return inputs + + def test_check_output(self): + self.check_output() + + +class TestBasicGRU(ModuleApiTest): + def setUp(self): + shape = (2, 4, 128) + self.inputs = [np.random.random(shape).astype("float32")] + self.outputs = None + self.attrs = {"input_size": 128, "hidden_size": 128} + self.param_states = {} + + @staticmethod + def model_init(self, input_size, hidden_size): + self.gru = RNN(BasicGRUCell(input_size, hidden_size)) + + @staticmethod + def model_forward(self, inputs): + return self.gru(inputs)[0] + + def make_inputs(self): + inputs = [ + Input( + [None, None, self.inputs[-1].shape[-1]], + "float32", + name="input") + ] + return inputs + + def test_check_output(self): + self.check_output() + + +class TestBeamSearch(ModuleApiTest): + def setUp(self): + shape = (8, 32) + self.inputs = [ + np.random.random(shape).astype("float32"), + np.random.random(shape).astype("float32") + ] + self.outputs = None + self.attrs = { + "vocab_size": 100, + "embed_dim": 32, + "hidden_size": 32, + } + self.param_states = {} + + @staticmethod + def model_init(self, + vocab_size, + embed_dim, + hidden_size, + bos_id=0, + eos_id=1, + beam_size=4, + max_step_num=20): + embedder = Embedding(size=[vocab_size, embed_dim]) + output_layer = Linear(hidden_size, vocab_size) + cell = BasicLSTMCell(embed_dim, hidden_size) + decoder = 
BeamSearchDecoder( + cell, + start_token=bos_id, + end_token=eos_id, + beam_size=beam_size, + embedding_fn=embedder, + output_fn=output_layer) + self.beam_search_decoder = DynamicDecode( + decoder, max_step_num=max_step_num, is_test=True) + + @staticmethod + def model_forward(self, init_hidden, init_cell): + return self.beam_search_decoder([init_hidden, init_cell])[0] + + def make_inputs(self): + inputs = [ + Input( + [None, self.inputs[0].shape[-1]], + "float32", + name="init_hidden"), Input( + [None, self.inputs[1].shape[-1]], + "float32", + name="init_cell") + ] + return inputs + + def test_check_output(self): + self.check_output() + + +class TestTransformerEncoder(ModuleApiTest): + def setUp(self): + self.inputs = [ + # encoder input: [batch_size, seq_len, hidden_size] + np.random.random([2, 4, 512]).astype("float32"), + # self attention bias: [batch_size, n_head, seq_len, seq_len] + np.random.randint(0, 1, [2, 8, 4, 4]).astype("float32") * -1e9 + ] + self.outputs = None + self.attrs = { + "n_layer": 2, + "n_head": 8, + "d_key": 64, + "d_value": 64, + "d_model": 512, + "d_inner_hid": 1024 + } + self.param_states = {} + + @staticmethod + def model_init(self, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + ffn_fc1_act="relu"): + self.encoder = TransformerEncoder( + n_layer, n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd, ffn_fc1_act) + + @staticmethod + def model_forward(self, enc_input, attn_bias): + return self.encoder(enc_input, attn_bias) + + def make_inputs(self): + inputs = [ + Input( + [None, None, self.inputs[0].shape[-1]], + "float32", + name="enc_input"), Input( + [None, self.inputs[1].shape[1], None, None], + "float32", + name="attn_bias") + ] + return inputs + + def test_check_output(self): + self.check_output() + + +class TestTransformerDecoder(TestTransformerEncoder): + def setUp(self): + self.inputs = [ + # decoder input: [batch_size, seq_len, hidden_size] + np.random.random([2, 4, 512]).astype("float32"), + # encoder output: [batch_size, seq_len, hidden_size] + np.random.random([2, 5, 512]).astype("float32"), + # self attention bias: [batch_size, n_head, seq_len, seq_len] + np.random.randint(0, 1, [2, 8, 4, 4]).astype("float32") * -1e9, + # cross attention bias: [batch_size, n_head, seq_len, seq_len] + np.random.randint(0, 1, [2, 8, 4, 5]).astype("float32") * -1e9 + ] + self.outputs = None + self.attrs = { + "n_layer": 2, + "n_head": 8, + "d_key": 64, + "d_value": 64, + "d_model": 512, + "d_inner_hid": 1024 + } + self.param_states = {} + + @staticmethod + def model_init(self, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da"): + self.decoder = TransformerDecoder( + n_layer, n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd) + + @staticmethod + def model_forward(self, + dec_input, + enc_output, + self_attn_bias, + cross_attn_bias, + caches=None): + return self.decoder(dec_input, enc_output, self_attn_bias, + cross_attn_bias, caches) + + def make_inputs(self): + inputs = [ + Input( + [None, None, self.inputs[0].shape[-1]], + "float32", + name="dec_input"), Input( + [None, None, self.inputs[0].shape[-1]], + "float32", + 
name="enc_output"), Input( + [None, self.inputs[-1].shape[1], None, None], + "float32", + name="self_attn_bias"), Input( + [None, self.inputs[-1].shape[1], None, None], + "float32", + name="cross_attn_bias") + ] + return inputs + + def test_check_output(self): + self.check_output() + + +class TestTransformerBeamSearchDecoder(ModuleApiTest): + def setUp(self): + shape = (8, 32) + self.inputs = [ + np.random.random(shape).astype("float32"), + np.random.random(shape).astype("float32") + ] + self.outputs = None + self.attrs = { + "vocab_size": 100, + "embed_dim": 32, + "hidden_size": 32, + } + self.param_states = {} + + @staticmethod + def model_init(self, + vocab_size, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + bos_id=0, + eos_id=1, + beam_size=4, + max_step_num=20): + embedder = Embedding(size=[vocab_size, d_model]) + output_layer = Linear(d_model, vocab_size) + decoder = TransformerDecoder(n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd) + transformer_cell = TransformerCell(decoder) + self.beam_search_decoder = DynamicDecode( + TransformerBeamSearchDecoder( + transformer_cell, + bos_id, + eos_id, + beam_size, + var_dim_in_state=2), + max_step_num, + is_test=True) + + @staticmethod + def model_forward(self, enc_output, trg_src_attn_bias): + caches = [{ + "k": layers.fill_constant_batch_size_like( + input=enc_output, + shape=[-1, self.n_head, 0, self.d_key], + dtype=enc_output.dtype, + value=0), + "v": layers.fill_constant_batch_size_like( + input=enc_output, + shape=[-1, self.n_head, 0, self.d_value], + dtype=enc_output.dtype, + value=0), + } for i in range(self.n_layer)] + enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + enc_output, self.beam_size) + trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + trg_src_attn_bias, self.beam_size) + static_caches = self.decoder.decoder.prepare_static_cache(enc_output) + rs, _ = self.beam_search_decoder( + inits=caches, + enc_output=enc_output, + trg_src_attn_bias=trg_src_attn_bias, + static_caches=static_caches) + return rs + + def make_inputs(self): + inputs = [ + Input( + [None, self.inputs[0].shape[-1]], + "float32", + name="init_hidden"), Input( + [None, self.inputs[1].shape[-1]], + "float32", + name="init_cell") + ] + return inputs + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/hapi/text/text.py b/hapi/text/text.py index ed803ae..0a382cd 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -48,8 +48,8 @@ 'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode', 'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', - 'TransformerDecoder', 'TransformerBeamSearchDecoder', 'Linear_chain_crf', - 'Crf_decoding', 'SequenceTagging', 'GRUEncoderLayer' + 'TransformerDecoder', 'TransformerCell', 'TransformerBeamSearchDecoder', + 'Linear_chain_crf', 'Crf_decoding', 'SequenceTagging', 'GRUEncoderLayer' ] @@ -1002,7 +1002,7 @@ def _maybe_copy(state, new_state, step_mask): **kwargs) -class TransfomerCell(object): +class TransformerCell(Layer): """ Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be used as RNNCell From eb20b652658a256c5bb6f026bebc2f66b44c11bf Mon Sep 17 00:00:00 2001 From: 
guosheng Date: Thu, 30 Apr 2020 18:15:00 +0800 Subject: [PATCH 02/16] Add more unit tests for apis in text.py. Rename some apis in text.py. --- examples/sentiment_classification/models.py | 62 +++++---- examples/transformer/transformer.py | 23 +++- hapi/tests/test_text.py | 140 +++++++++++++------- hapi/text/__init__.py | 4 +- hapi/text/text.py | 71 +++++++--- 5 files changed, 205 insertions(+), 95 deletions(-) diff --git a/examples/sentiment_classification/models.py b/examples/sentiment_classification/models.py index 313b928..1816ba4 100644 --- a/examples/sentiment_classification/models.py +++ b/examples/sentiment_classification/models.py @@ -16,12 +16,12 @@ from paddle.fluid.dygraph.base import to_variable import numpy as np from hapi.model import Model -from hapi.text.text import GRUEncoderLayer as BiGRUEncoder +from hapi.text.text import GRUEncoder as BiGRUEncoder from hapi.text.test import BOWEncoder, CNNEncoder, GRUEncoder class CNN(Model): - def __init__(self, dict_dim, batch_size, seq_len): + def __init__(self, dict_dim, batch_size, seq_len): super(CNN, self).__init__() self.dict_dim = dict_dim self.emb_dim = 128 @@ -36,15 +36,19 @@ def __init__(self, dict_dim, batch_size, seq_len): dict_size=self.dict_dim + 1, emb_dim=self.emb_dim, seq_len=self.seq_len, - filter_size= self.win_size, - num_filters= self.hid_dim, - hidden_dim= self.hid_dim, + filter_size=self.win_size, + num_filters=self.hid_dim, + hidden_dim=self.hid_dim, padding_idx=None, act='tanh') - self._fc1 = Linear(input_dim = self.hid_dim*self.seq_len, output_dim=self.fc_hid_dim, act="softmax") - self._fc_prediction = Linear(input_dim = self.fc_hid_dim, - output_dim = self.class_dim, - act="softmax") + self._fc1 = Linear( + input_dim=self.hid_dim * self.seq_len, + output_dim=self.fc_hid_dim, + act="softmax") + self._fc_prediction = Linear( + input_dim=self.fc_hid_dim, + output_dim=self.class_dim, + act="softmax") def forward(self, inputs): conv_3 = self._encoder(inputs) @@ -69,11 +73,14 @@ def __init__(self, dict_dim, batch_size, seq_len): padding_idx=None, bow_dim=self.hid_dim, seq_len=self.seq_len) - self._fc1 = Linear(input_dim = self.hid_dim, output_dim=self.hid_dim, act="tanh") - self._fc2 = Linear(input_dim = self.hid_dim, output_dim=self.fc_hid_dim, act="tanh") - self._fc_prediction = Linear(input_dim = self.fc_hid_dim, - output_dim = self.class_dim, - act="softmax") + self._fc1 = Linear( + input_dim=self.hid_dim, output_dim=self.hid_dim, act="tanh") + self._fc2 = Linear( + input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh") + self._fc_prediction = Linear( + input_dim=self.fc_hid_dim, + output_dim=self.class_dim, + act="softmax") def forward(self, inputs): bow_1 = self._encoder(inputs) @@ -94,10 +101,12 @@ def __init__(self, dict_dim, batch_size, seq_len): self.class_dim = 2 self.batch_size = batch_size self.seq_len = seq_len - self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh") - self._fc_prediction = Linear(input_dim=self.fc_hid_dim, - output_dim=self.class_dim, - act="softmax") + self._fc1 = Linear( + input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh") + self._fc_prediction = Linear( + input_dim=self.fc_hid_dim, + output_dim=self.class_dim, + act="softmax") self._encoder = GRUEncoder( dict_size=self.dict_dim + 1, emb_dim=self.emb_dim, @@ -112,7 +121,7 @@ def forward(self, inputs): prediction = self._fc_prediction(fc_1) return prediction - + class BiGRU(Model): def __init__(self, dict_dim, batch_size, seq_len): super(BiGRU, self).__init__() @@ -130,11 +139,13 
@@ def __init__(self, dict_dim, batch_size, seq_len): is_sparse=False) h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") h_0 = to_variable(h_0) - self._fc1 = Linear(input_dim = self.hid_dim, output_dim=self.hid_dim*3) - self._fc2 = Linear(input_dim = self.hid_dim*2, output_dim=self.fc_hid_dim, act="tanh") - self._fc_prediction = Linear(input_dim=self.fc_hid_dim, - output_dim=self.class_dim, - act="softmax") + self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3) + self._fc2 = Linear( + input_dim=self.hid_dim * 2, output_dim=self.fc_hid_dim, act="tanh") + self._fc_prediction = Linear( + input_dim=self.fc_hid_dim, + output_dim=self.class_dim, + act="softmax") self._encoder = BiGRUEncoder( grnn_hidden_dim=self.hid_dim, input_dim=self.hid_dim * 3, @@ -144,7 +155,8 @@ def __init__(self, dict_dim, batch_size, seq_len): def forward(self, inputs): emb = self.embedding(inputs) - emb = fluid.layers.reshape(emb, shape=[self.batch_size, -1, self.hid_dim]) + emb = fluid.layers.reshape( + emb, shape=[self.batch_size, -1, self.hid_dim]) fc_1 = self._fc1(emb) encoded_vector = self._encoder(fc_1) encoded_vector = fluid.layers.tanh(encoded_vector) diff --git a/examples/transformer/transformer.py b/examples/transformer/transformer.py index b2ec120..179dc17 100644 --- a/examples/transformer/transformer.py +++ b/examples/transformer/transformer.py @@ -21,7 +21,7 @@ from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay from hapi.model import Model, CrossEntropy, Loss -from hapi.text import TransformerCell, TransformerBeamSearchDecoder, DynamicDecode +from hapi.text import TransformerBeamSearchDecoder, DynamicDecode def position_encoding_init(n_position, d_pos_vec): @@ -606,6 +606,27 @@ def forward(self, src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, return predict +class TransformerCell(Layer): + """ + Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be + used as RNNCell + """ + + def __init__(self, decoder): + super(TransformerCell, self).__init__() + self.decoder = decoder + + def forward(self, inputs, states, trg_src_attn_bias, enc_output, + static_caches): + trg_word, trg_pos = inputs + for cache, static_cache in zip(states, static_caches): + cache.update(static_cache) + logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias, + enc_output, states) + new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states] + return logits, new_states + + class InferTransformer(Transformer): """ model for prediction diff --git a/hapi/tests/test_text.py b/hapi/tests/test_text.py index 46efbf6..eca5fda 100644 --- a/hapi/tests/test_text.py +++ b/hapi/tests/test_text.py @@ -25,8 +25,8 @@ from paddle.fluid.layers import BeamSearchDecoder import hapi.text as text from hapi.model import Model, Input, set_device -from hapi.text import BasicLSTMCell, BasicGRUCell, RNN, DynamicDecode, MultiHeadAttention, TransformerEncoder -from hapi.text import * +# from hapi.text.text import BasicLSTMCell, BasicGRUCell, RNN, DynamicDecode, MultiHeadAttention, TransformerEncoder, TransformerCell +from hapi.text.text import * def sigmoid(x): @@ -187,7 +187,7 @@ def make_inputs(self): Input( [None, None, self.inputs[-1].shape[-1]], "float32", - name="input") + name="input"), ] return inputs @@ -216,7 +216,7 @@ def make_inputs(self): Input( [None, None, self.inputs[-1].shape[-1]], "float32", - name="input") + name="input"), ] return inputs @@ -270,10 +270,9 @@ def 
make_inputs(self): Input( [None, self.inputs[0].shape[-1]], "float32", - name="init_hidden"), Input( - [None, self.inputs[1].shape[-1]], - "float32", - name="init_cell") + name="init_hidden"), + Input( + [None, self.inputs[1].shape[-1]], "float32", name="init_cell"), ] return inputs @@ -328,10 +327,11 @@ def make_inputs(self): Input( [None, None, self.inputs[0].shape[-1]], "float32", - name="enc_input"), Input( - [None, self.inputs[1].shape[1], None, None], - "float32", - name="attn_bias") + name="enc_input"), + Input( + [None, self.inputs[1].shape[1], None, None], + "float32", + name="attn_bias"), ] return inputs @@ -395,16 +395,19 @@ def make_inputs(self): Input( [None, None, self.inputs[0].shape[-1]], "float32", - name="dec_input"), Input( - [None, None, self.inputs[0].shape[-1]], - "float32", - name="enc_output"), Input( - [None, self.inputs[-1].shape[1], None, None], - "float32", - name="self_attn_bias"), Input( - [None, self.inputs[-1].shape[1], None, None], - "float32", - name="cross_attn_bias") + name="dec_input"), + Input( + [None, None, self.inputs[0].shape[-1]], + "float32", + name="enc_output"), + Input( + [None, self.inputs[-1].shape[1], None, None], + "float32", + name="self_attn_bias"), + Input( + [None, self.inputs[-1].shape[1], None, None], + "float32", + name="cross_attn_bias"), ] return inputs @@ -414,16 +417,21 @@ def test_check_output(self): class TestTransformerBeamSearchDecoder(ModuleApiTest): def setUp(self): - shape = (8, 32) self.inputs = [ - np.random.random(shape).astype("float32"), - np.random.random(shape).astype("float32") + # encoder output: [batch_size, seq_len, hidden_size] + np.random.random([2, 5, 128]).astype("float32"), + # cross attention bias: [batch_size, n_head, seq_len, seq_len] + np.random.randint(0, 1, [2, 2, 1, 5]).astype("float32") * -1e9 ] self.outputs = None self.attrs = { "vocab_size": 100, - "embed_dim": 32, - "hidden_size": 32, + "n_layer": 2, + "n_head": 2, + "d_key": 64, + "d_value": 64, + "d_model": 128, + "d_inner_hid": 128 } self.param_states = {} @@ -445,13 +453,24 @@ def model_init(self, eos_id=1, beam_size=4, max_step_num=20): - embedder = Embedding(size=[vocab_size, d_model]) + self.beam_size = beam_size + + def embeder_init(self, size): + Layer.__init__(self) + self.embedder = Embedding(size) + + Embedder = type("Embedder", (Layer, ), { + "__init__": embeder_init, + "forward": lambda self, word, pos: self.embedder(word) + }) + embedder = Embedder(size=[vocab_size, d_model]) output_layer = Linear(d_model, vocab_size) - decoder = TransformerDecoder(n_layer, n_head, d_key, d_value, d_model, - d_inner_hid, prepostprocess_dropout, - attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd) - transformer_cell = TransformerCell(decoder) + self.decoder = TransformerDecoder( + n_layer, n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd) + transformer_cell = TransformerCell(self.decoder, embedder, + output_layer) self.beam_search_decoder = DynamicDecode( TransformerBeamSearchDecoder( transformer_cell, @@ -464,23 +483,12 @@ def model_init(self, @staticmethod def model_forward(self, enc_output, trg_src_attn_bias): - caches = [{ - "k": layers.fill_constant_batch_size_like( - input=enc_output, - shape=[-1, self.n_head, 0, self.d_key], - dtype=enc_output.dtype, - value=0), - "v": layers.fill_constant_batch_size_like( - input=enc_output, - shape=[-1, self.n_head, 0, self.d_value], - dtype=enc_output.dtype, - value=0), - } for i in 
range(self.n_layer)] + caches = self.decoder.prepare_incremental_cache(enc_output) enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( enc_output, self.beam_size) trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( trg_src_attn_bias, self.beam_size) - static_caches = self.decoder.decoder.prepare_static_cache(enc_output) + static_caches = self.decoder.prepare_static_cache(enc_output) rs, _ = self.beam_search_decoder( inits=caches, enc_output=enc_output, @@ -491,12 +499,42 @@ def model_forward(self, enc_output, trg_src_attn_bias): def make_inputs(self): inputs = [ Input( - [None, self.inputs[0].shape[-1]], + [None, None, self.inputs[0].shape[-1]], + "float32", + name="enc_output"), + Input( + [None, self.inputs[1].shape[1], None, None], + "float32", + name="trg_src_attn_bias"), + ] + return inputs + + def test_check_output(self): + self.check_output() + + +class TestSequenceTagging(ModuleApiTest): + def setUp(self): + shape = (2, 4, 128) + self.inputs = [np.random.random(shape).astype("float32")] + self.outputs = None + self.attrs = {"input_size": 128, "hidden_size": 128} + self.param_states = {} + + @staticmethod + def model_init(self, input_size, hidden_size): + self.module = SequenceTagging(input_size, hidden_size) + + @staticmethod + def model_forward(self, inputs): + return self.gru(inputs)[0] + + def make_inputs(self): + inputs = [ + Input( + [None, None, self.inputs[-1].shape[-1]], "float32", - name="init_hidden"), Input( - [None, self.inputs[1].shape[-1]], - "float32", - name="init_cell") + name="input"), ] return inputs diff --git a/hapi/text/__init__.py b/hapi/text/__init__.py index 2177ada..890e989 100644 --- a/hapi/text/__init__.py +++ b/hapi/text/__init__.py @@ -28,6 +28,6 @@ from hapi.text.text import GRUCell as GRUCell from hapi.text.text import GRUEncoderCell as GRUEncoderCell from hapi.text.text import BiGRU as BiGRU -from hapi.text.text import Linear_chain_crf as Linear_chain_crf -from hapi.text.text import Crf_decoding as Crf_decoding +from hapi.text.text import LinearChainCRF as LinearChainCRF +from hapi.text.text import CRFDecoding as CRFDecoding from hapi.text.text import SequenceTagging as SequenceTagging diff --git a/hapi/text/text.py b/hapi/text/text.py index 0a382cd..8332700 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -49,7 +49,7 @@ 'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', 'TransformerDecoder', 'TransformerCell', 'TransformerBeamSearchDecoder', - 'Linear_chain_crf', 'Crf_decoding', 'SequenceTagging', 'GRUEncoderLayer' + 'LinearChainCRF', 'CRFDecoding', 'SequenceTagging', 'GRUEncoder' ] @@ -1008,18 +1008,38 @@ class TransformerCell(Layer): used as RNNCell """ - def __init__(self, decoder): + def __init__(self, decoder, embedding_fn=None, output_fn=None): + super(TransformerCell, self).__init__() self.decoder = decoder + self.embedding_fn = embedding_fn + self.output_fn = output_fn - def __call__(self, inputs, states, trg_src_attn_bias, enc_output, - static_caches): + def forward(self, inputs, states, trg_src_attn_bias, enc_output, + static_caches): trg_word, trg_pos = inputs for cache, static_cache in zip(states, static_caches): cache.update(static_cache) - logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias, - enc_output, states) + if self.embedding_fn is not None: + dec_input = self.embedding_fn(trg_word, trg_pos) + outputs = self.decoder(dec_input, enc_output, None, + trg_src_attn_bias, states) + else: + outputs 
= self.decoder(trg_word, trg_pos, enc_output, None, + trg_src_attn_bias, states) + if self.output_fn is not None: + outputs = self.output_fn(outputs) + if len(outputs.shape) == 3: + # squeeze to adapt to BeamSearchDecoder which use 2D logits + outputs = layers.squeeze(outputs, [1]) new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states] - return logits, new_states + return outputs, new_states + + @property + def state_shape(self): + return [{ + "k": [self.n_head, 0, self.d_key], + "v": [self.n_head, 0, self.d_value], + } for i in range(len(self.n_layer))] class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): @@ -1521,6 +1541,11 @@ def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, preprocess_cmd, postprocess_cmd): super(TransformerDecoder, self).__init__() + self.n_layer = n_layer + self.n_head = n_head + self.d_key = d_key + self.d_value = d_value + self.decoder_layers = list() for i in range(n_layer): self.decoder_layers.append( @@ -1555,6 +1580,20 @@ def prepare_static_cache(self, enc_output): for decoder_layer in self.decoder_layers ] + def prepare_incremental_cache(self, enc_output): + return [{ + "k": layers.fill_constant_batch_size_like( + input=enc_output, + shape=[-1, self.n_head, 0, self.d_key], + dtype=enc_output.dtype, + value=0), + "v": layers.fill_constant_batch_size_like( + input=enc_output, + shape=[-1, self.n_head, 0, self.d_value], + dtype=enc_output.dtype, + value=0), + } for i in range(self.n_layer)] + #TODO: we should merge GRUCell with BasicGRUCell class GRUCell(RNNCell): @@ -1651,9 +1690,9 @@ def forward(self, input_feature): return bi_merge -class Linear_chain_crf(fluid.dygraph.Layer): +class LinearChainCRF(Layer): def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): - super(Linear_chain_crf, self).__init__() + super(LinearChainCRF, self).__init__() self._param_attr = param_attr self._dtype = dtype @@ -1702,9 +1741,9 @@ def forward(self, input, label, length=None): return log_likelihood -class Crf_decoding(fluid.dygraph.Layer): +class CRFDecoding(Layer): def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): - super(Crf_decoding, self).__init__() + super(CRFDecoding, self).__init__() self._dtype = dtype self._size = size @@ -1742,7 +1781,7 @@ def forward(self, input, label=None, length=None): return viterbi_path -class GRUEncoderLayer(Layer): +class GRUEncoder(Layer): def __init__(self, input_dim, grnn_hidden_dim, @@ -1750,7 +1789,7 @@ def __init__(self, num_layers=1, h_0=None, is_bidirection=False): - super(GRUEncoderLayer, self).__init__() + super(GRUEncoder, self).__init__() self.h_0 = h_0 self.num_layers = num_layers self.is_bidirection = is_bidirection @@ -1849,7 +1888,7 @@ def __init__(self, force_cpu=True, name='h_0') - self.gru_encoder = GRUEncoderLayer( + self.gru_encoder = GRUEncoder( input_dim=self.grnn_hidden_dim, grnn_hidden_dim=self.grnn_hidden_dim, init_bound=self.init_bound, @@ -1866,12 +1905,12 @@ def __init__(self, regularizer=fluid.regularizer.L2DecayRegularizer( regularization_coeff=1e-4))) - self.linear_chain_crf = Linear_chain_crf( + self.linear_chain_crf = LinearChainCRF( param_attr=fluid.ParamAttr( name='linear_chain_crfw', learning_rate=self.crf_lr), size=self.num_labels) - self.crf_decoding = Crf_decoding( + self.crf_decoding = CRFDecoding( param_attr=fluid.ParamAttr( name='crfw', learning_rate=self.crf_lr), size=self.num_labels) From d88cbf7543cfc43ee0a935b58a10bbbb953d0b71 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 7 May 2020 21:23:55 +0800 
Subject: [PATCH 03/16] Add StackedRNN and BiRNN. --- hapi/tests/test_text.py | 179 +++++++++++++++- hapi/text/text.py | 465 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 613 insertions(+), 31 deletions(-) diff --git a/hapi/tests/test_text.py b/hapi/tests/test_text.py index eca5fda..6f0d014 100644 --- a/hapi/tests/test_text.py +++ b/hapi/tests/test_text.py @@ -25,7 +25,6 @@ from paddle.fluid.layers import BeamSearchDecoder import hapi.text as text from hapi.model import Model, Input, set_device -# from hapi.text.text import BasicLSTMCell, BasicGRUCell, RNN, DynamicDecode, MultiHeadAttention, TransformerEncoder, TransformerCell from hapi.text.text import * @@ -515,15 +514,142 @@ def test_check_output(self): class TestSequenceTagging(ModuleApiTest): def setUp(self): - shape = (2, 4, 128) + self.inputs = [ + np.random.randint(0, 100, (2, 8)).astype("int64"), + np.random.randint(1, 8, (2)).astype("int64"), + np.random.randint(0, 5, (2, 8)).astype("int64") + ] + self.outputs = None + self.attrs = {"vocab_size": 100, "num_labels": 5} + self.param_states = {} + + @staticmethod + def model_init(self, + vocab_size, + num_labels, + word_emb_dim=128, + grnn_hidden_dim=128, + emb_learning_rate=0.1, + crf_learning_rate=0.1, + bigru_num=2, + init_bound=0.1): + self.tagger = SequenceTagging(vocab_size, num_labels, word_emb_dim, + grnn_hidden_dim, emb_learning_rate, + crf_learning_rate, bigru_num, init_bound) + + @staticmethod + def model_forward(self, word, lengths, target=None): + return self.tagger(word, lengths, target) + + def make_inputs(self): + inputs = [ + Input( + [None, None], "int64", name="word"), + Input( + [None], "int64", name="lengths"), + Input( + [None, None], "int64", name="target"), + ] + return inputs + + def test_check_output(self): + self.check_output() + + +class TestSequenceTaggingInfer(TestSequenceTagging): + def setUp(self): + super(TestSequenceTaggingInfer, self).setUp() + self.inputs = self.inputs[:2] # remove target + + def make_inputs(self): + inputs = super(TestSequenceTaggingInfer, + self).make_inputs()[:2] # remove target + return inputs + + +class TestLSTM(ModuleApiTest): + def setUp(self): + shape = (2, 4, 16) self.inputs = [np.random.random(shape).astype("float32")] self.outputs = None - self.attrs = {"input_size": 128, "hidden_size": 128} + self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2} self.param_states = {} @staticmethod - def model_init(self, input_size, hidden_size): - self.module = SequenceTagging(input_size, hidden_size) + def model_init(self, input_size, hidden_size, num_layers): + self.lstm = LSTM(input_size, hidden_size, num_layers=num_layers) + + @staticmethod + def model_forward(self, inputs): + return self.lstm(inputs)[0] + + def make_inputs(self): + inputs = [ + Input( + [None, None, self.inputs[-1].shape[-1]], + "float32", + name="input"), + ] + return inputs + + def test_check_output(self): + self.check_output() + + +class TestBiLSTM(ModuleApiTest): + def setUp(self): + shape = (2, 4, 16) + self.inputs = [np.random.random(shape).astype("float32")] + self.outputs = None + self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2} + self.param_states = {} + + @staticmethod + def model_init(self, + input_size, + hidden_size, + num_layers, + merge_mode="concat", + merge_each_layer=False): + self.bilstm = BidirectionalLSTM( + input_size, + hidden_size, + num_layers=num_layers, + merge_mode=merge_mode, + merge_each_layer=merge_each_layer) + + @staticmethod + def model_forward(self, inputs): + return 
self.bilstm(inputs)[0] + + def make_inputs(self): + inputs = [ + Input( + [None, None, self.inputs[-1].shape[-1]], + "float32", + name="input"), + ] + return inputs + + def test_check_output_merge0(self): + self.check_output() + + def test_check_output_merge1(self): + self.attrs["merge_each_layer"] = True + self.check_output() + + +class TestGRU(ModuleApiTest): + def setUp(self): + shape = (2, 4, 64) + self.inputs = [np.random.random(shape).astype("float32")] + self.outputs = None + self.attrs = {"input_size": 64, "hidden_size": 128, "num_layers": 2} + self.param_states = {} + + @staticmethod + def model_init(self, input_size, hidden_size, num_layers): + self.gru = GRU(input_size, hidden_size, num_layers=num_layers) @staticmethod def model_forward(self, inputs): @@ -542,5 +668,48 @@ def test_check_output(self): self.check_output() +class TestBiGRU(ModuleApiTest): + def setUp(self): + shape = (2, 4, 64) + self.inputs = [np.random.random(shape).astype("float32")] + self.outputs = None + self.attrs = {"input_size": 64, "hidden_size": 128, "num_layers": 2} + self.param_states = {} + + @staticmethod + def model_init(self, + input_size, + hidden_size, + num_layers, + merge_mode="concat", + merge_each_layer=False): + self.bigru = BidirectionalGRU( + input_size, + hidden_size, + num_layers=num_layers, + merge_mode=merge_mode, + merge_each_layer=merge_each_layer) + + @staticmethod + def model_forward(self, inputs): + return self.bigru(inputs)[0] + + def make_inputs(self): + inputs = [ + Input( + [None, None, self.inputs[-1].shape[-1]], + "float32", + name="input"), + ] + return inputs + + def test_check_output_merge0(self): + self.check_output() + + def test_check_output_merge1(self): + self.attrs["merge_each_layer"] = True + self.check_output() + + if __name__ == '__main__': unittest.main() diff --git a/hapi/text/text.py b/hapi/text/text.py index 8332700..b5a0cf5 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -49,7 +49,9 @@ 'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', 'TransformerDecoder', 'TransformerCell', 'TransformerBeamSearchDecoder', - 'LinearChainCRF', 'CRFDecoding', 'SequenceTagging', 'GRUEncoder' + 'LinearChainCRF', 'CRFDecoding', 'SequenceTagging', 'GRUEncoder', + 'StackedLSTMCell', 'LSTM', 'BidirectionalLSTM', 'StackedGRUCell', 'GRU', + 'BidirectionalGRU' ] @@ -241,7 +243,7 @@ def __init__(self, # TODO(guosheng): find better way to resolve constants in __init__ self._forget_bias = layers.create_global_var( shape=[1], dtype=dtype, value=forget_bias, persistable=True) - self._forget_bias.stop_gradient = False + self._forget_bias.stop_gradient = True self._dtype = dtype self._input_size = input_size @@ -468,9 +470,11 @@ def forward(self, input, state): new_cell = layers.elementwise_add( layers.elementwise_mul( pre_cell, - layers.sigmoid(layers.elementwise_add(f, self._forget_bias))), - layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j))) - new_hidden = layers.tanh(new_cell) * layers.sigmoid(o) + self._gate_activation( + layers.elementwise_add(f, self._forget_bias))), + layers.elementwise_mul( + self._gate_activation(i), self._activation(j))) + new_hidden = self._activation(new_cell) * self._gate_activation(o) return new_hidden, [new_hidden, new_cell] @@ -1029,7 +1033,7 @@ def forward(self, inputs, states, trg_src_attn_bias, enc_output, if self.output_fn is not None: outputs = self.output_fn(outputs) if len(outputs.shape) == 3: - # squeeze to adapt to BeamSearchDecoder which use 2D logits + # 
squeeze to adapt to BeamSearchDecoder which use 2D logits outputs = layers.squeeze(outputs, [1]) new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states] return outputs, new_states @@ -1037,9 +1041,9 @@ def forward(self, inputs, states, trg_src_attn_bias, enc_output, @property def state_shape(self): return [{ - "k": [self.n_head, 0, self.d_key], - "v": [self.n_head, 0, self.d_value], - } for i in range(len(self.n_layer))] + "k": [self.decoder.n_head, 0, self.decoder.d_key], + "v": [self.decoder.n_head, 0, self.decoder.d_value], + } for i in range(len(self.decoder.n_layer))] class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): @@ -1787,10 +1791,8 @@ def __init__(self, grnn_hidden_dim, init_bound, num_layers=1, - h_0=None, is_bidirection=False): super(GRUEncoder, self).__init__() - self.h_0 = h_0 self.num_layers = num_layers self.is_bidirection = is_bidirection self.gru_list = [] @@ -1827,7 +1829,7 @@ def __init__(self, is_reverse=True, time_major=False))) - def forward(self, input_feature): + def forward(self, input_feature, h0=None): for i in range(self.num_layers): pre_gru, pre_state = self.gru_list[i](input_feature) if self.is_bidirection: @@ -1839,18 +1841,16 @@ def forward(self, input_feature): return out -class SequenceTagging(fluid.dygraph.Layer): +class SequenceTagging(Layer): def __init__(self, vocab_size, num_labels, - batch_size, word_emb_dim=128, grnn_hidden_dim=128, emb_learning_rate=0.1, crf_learning_rate=0.1, bigru_num=2, - init_bound=0.1, - length=None): + init_bound=0.1): super(SequenceTagging, self).__init__() """ define the sequence tagging network structure @@ -1868,7 +1868,6 @@ def __init__(self, self.emb_lr = emb_learning_rate self.crf_lr = crf_learning_rate self.bigru_num = bigru_num - self.batch_size = batch_size self.init_bound = 0.1 self.word_embedding = Embedding( @@ -1880,20 +1879,11 @@ def __init__(self, initializer=fluid.initializer.Uniform( low=-self.init_bound, high=self.init_bound))) - h_0 = fluid.layers.create_global_var( - shape=[self.batch_size, self.grnn_hidden_dim], - value=0.0, - dtype='float32', - persistable=True, - force_cpu=True, - name='h_0') - self.gru_encoder = GRUEncoder( input_dim=self.grnn_hidden_dim, grnn_hidden_dim=self.grnn_hidden_dim, init_bound=self.init_bound, num_layers=self.bigru_num, - h_0=h_0, is_bidirection=True) self.fc = Linear( @@ -1936,3 +1926,426 @@ def forward(self, word, lengths, target=None): self.linear_chain_crf.weight = self.crf_decoding.weight crf_decode = self.crf_decoding(input=emission, length=lengths) return crf_decode, lengths + + +class StackedRNNCell(RNNCell): + def __init__(self, cells): + self.cells = [] + for i, cell in enumerate(cells): + self.cells.append(self.add_sublayer("cell_%d" % i, cell)) + + def forward(self, inputs, states): + pass + + @staticmethod + def stack_param_attr(param_attr, n): + if isinstance(param_attr, (list, tuple)): + assert len(param_attr) == n, ( + "length of param_attr should be %d when it is a list/tuple" % + n) + param_attrs = [ + fluid.ParamAttr._to_attr(attr) for attr in param_attr + ] + else: + param_attrs = [] + attr = fluid.ParamAttr._to_attr(param_attr) + for i in range(n): + attr_i = copy.deepcopy(attr) + if attr.name: + attr_i.name = attr_i.name + "_" + str(i) + param_attrs.append(attr_i) + return param_attrs + + @property + def state_shape(self): + return [cell.state_shape for cell in self.cells] + + +class StackedLSTMCell(RNNCell): + """ + """ + + def __init__(self, + input_size, + hidden_size, + gate_activation=None, + activation=None, + 
forget_bias=1.0, + num_layers=1, + dropout=0.0, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(StackedLSTMCell, self).__init__() + self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", + float) + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + + self.cells = [] + for i in range(num_layers): + if forget_bias is True: + bias_attrs[ + i].initializer = fluid.initializer.NumpyArrayInitializer( + np.concatenate( + np.zeros(2 * hidden_size), + np.ones(hidden_size), np.zeros(hidden_size)) + .astype(dtype)) + forget_bias = 0.0 + self.cells.append( + self.add_sublayer( + "lstm_%d" % i, + BasicLSTMCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + gate_activation=gate_activation, + activation=activation, + forget_bias=forget_bias, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) + + def forward(self, step_input, states): + new_states = [] + for i, cell in enumerate(self.cells): + out, new_state = cell(step_input, states[i]) + step_input = layers.dropout( + out, + self.dropout[i], + dropout_implementation='upscale_in_train') if self.dropout[ + i] > 0 else out + new_states.append(new_state) + return step_input, new_states + + @property + def state_shape(self): + return [cell.state_shape for cell in self.cells] + + +class LSTM(Layer): + def __init__(self, + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + is_reverse=False, + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(LSTM, self).__init__() + lstm_cell = StackedLSTMCell(input_size, hidden_size, gate_activation, + activation, forget_bias, num_layers, + dropout, param_attr, bias_attr, dtype) + self.lstm = RNN(lstm_cell, is_reverse, time_major) + + def forward(self, inputs, initial_states=None, sequence_length=None): + return self.lstm(inputs, initial_states, sequence_length) + + +class BidirectionalRNN(Layer): + def __init__(self, + cell_fw, + cell_bw, + merge_mode='concat', + time_major=False, + cell_cls=None, + **kwargs): + super(BidirectionalRNN, self).__init__() + self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) + self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) + if merge_mode == 'concat': + self.merge_func = lambda x, y: layers.concat([x, y], -1) + elif merge_mode == 'sum': + self.merge_func = lambda x, y: layers.elementwise_add(x, y) + elif merge_mode == 'ave': + self.merge_func = lambda x, y: layers.scale( + layers.elementwise_add(x, y), 0.5) + elif merge_mode == 'mul': + self.merge_func = lambda x, y: layers.elementwise_mul(x, y) + elif merge_mode == 'zip': + self.merge_func = lambda x, y: (x, y) + elif merge_mode is None: + self.merge_func = None + else: + raise ValueError('Unsupported value for `merge_mode`: %s' % + merge_mode) + + def forward(self, inputs, initial_states=None, sequence_length=None): + if isinstance(initial_states, (list, tuple)): + assert len( + initial_states + ) == 2, "length of initial_states should be 2 when it is a list/tuple" + else: + initial_states = [initial_states, initial_states] + outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], + sequence_length) + outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], + sequence_length) + outputs = map_structure( + self.merge_func, outputs_fw, + outputs_bw) if self.merge_func else (outputs_fw, outputs_bw) + return outputs, 
(states_fw, states_bw) + + @staticmethod + def bidirect_param_attr(param_attr): + if isinstance(param_attr, (list, tuple)): + assert len( + param_attr + ) == 2, "length of param_attr should be 2 when it is a list/tuple" + param_attrs = param_attr + else: + param_attrs = [] + attr = fluid.ParamAttr._to_attr(param_attr) + attr_fw = copy.deepcopy(attr) + if attr.name: + attr_fw.name = attr_fw.name + "_fw" + param_attrs.append(attr_fw) + attr_bw = copy.deepcopy(attr) + if attr.name: + attr_bw.name = attr_bw.name + "_bw" + param_attrs.append(attr_bw) + return param_attrs + + +class BidirectionalLSTM(Layer): + def __init__(self, + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + merge_mode='concat', + merge_each_layer=False, + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(BidirectionalLSTM, self).__init__() + self.num_layers = num_layers + self.merge_mode = merge_mode + self.merge_each_layer = merge_each_layer + param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) + bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) + if not merge_each_layer: + cell_fw = StackedLSTMCell(input_size, hidden_size, gate_activation, + activation, forget_bias, num_layers, + dropout, param_attrs[0], bias_attrs[0], + dtype) + cell_bw = StackedLSTMCell(input_size, hidden_size, gate_activation, + activation, forget_bias, num_layers, + dropout, param_attrs[1], bias_attrs[1], + dtype) + self.lstm = BidirectionalRNN( + cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major) + else: + fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], + num_layers) + bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], + num_layers) + fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], + num_layers) + bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], + num_layers) + + # maybe design cell including both forward and backward later + self.lstm = [] + for i in range(num_layers): + cell_fw = StackedLSTMCell( + input_size if i == 0 else (hidden_size * 2 + if merge_mode == 'concat' else + hidden_size), hidden_size, + gate_activation, activation, forget_bias, 1, dropout, + fw_param_attrs[i], fw_bias_attrs[i], dtype) + cell_bw = StackedLSTMCell( + input_size if i == 0 else (hidden_size * 2 + if merge_mode == 'concat' else + hidden_size), hidden_size, + gate_activation, activation, forget_bias, 1, dropout, + bw_param_attrs[i], bw_bias_attrs[i], dtype) + self.lstm.append( + self.add_sublayer( + "lstm_%d" % i, + BidirectionalRNN( + cell_fw, + cell_bw, + merge_mode=merge_mode, + time_major=time_major))) + + def forward(self, inputs, initial_states=None, sequence_length=None): + if not self.merge_each_layer: + return self.lstm(inputs, initial_states, sequence_length) + else: + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == self.num_layers, ( + "length of initial_states should be %d when it is a list/tuple" + % self.num_layers) + else: + initial_states = [initial_states] * self.num_layers + stacked_states = [] + for i in range(self.num_layers): + outputs, states = self.lstm[i](inputs, initial_states[i], + sequence_length) + inputs = outputs + stacked_states.append(states) + return outputs, stacked_states + + +class StackedGRUCell(RNNCell): + """ + """ + + def __init__(self, + input_size, + hidden_size, + gate_activation=None, + activation=None, + num_layers=1, + dropout=0.0, + param_attr=None, + bias_attr=None, + dtype="float32"): + 
super(StackedGRUCell, self).__init__() + self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", + float) + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + + self.cells = [] + for i in range(num_layers): + self.cells.append( + self.add_sublayer( + "gru_%d" % i, + BasicGRUCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + gate_activation=gate_activation, + activation=activation, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) + + def forward(self, step_input, states): + new_states = [] + for i, cell in enumerate(self.cells): + out, new_state = cell(step_input, states[i]) + step_input = layers.dropout( + out, + self.dropout[i], + dropout_implementation='upscale_in_train') if self.dropout[ + i] > 0 else out + new_states.append(new_state) + return step_input, new_states + + @property + def state_shape(self): + return [cell.state_shape for cell in self.cells] + + +class GRU(Layer): + def __init__(self, + input_size, + hidden_size, + gate_activation=None, + activation=None, + num_layers=1, + dropout=0.0, + is_reverse=False, + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(GRU, self).__init__() + gru_cell = StackedGRUCell(input_size, hidden_size, gate_activation, + activation, num_layers, dropout, param_attr, + bias_attr, dtype) + self.gru = RNN(gru_cell, is_reverse, time_major) + + def forward(self, inputs, initial_states=None, sequence_length=None): + return self.gru(inputs, initial_states, sequence_length) + + +class BidirectionalGRU(Layer): + def __init__(self, + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + merge_mode='concat', + merge_each_layer=False, + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(BidirectionalGRU, self).__init__() + self.num_layers = num_layers + self.merge_mode = merge_mode + self.merge_each_layer = merge_each_layer + param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) + bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) + if not merge_each_layer: + cell_fw = StackedGRUCell(input_size, hidden_size, gate_activation, + activation, num_layers, dropout, + param_attrs[0], bias_attrs[0], dtype) + cell_bw = StackedGRUCell(input_size, hidden_size, gate_activation, + activation, num_layers, dropout, + param_attrs[1], bias_attrs[1], dtype) + self.gru = BidirectionalRNN( + cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major) + else: + fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], + num_layers) + bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], + num_layers) + fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], + num_layers) + bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], + num_layers) + + # maybe design cell including both forward and backward later + self.gru = [] + for i in range(num_layers): + cell_fw = StackedGRUCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, gate_activation, activation, 1, + dropout, fw_param_attrs[i], + fw_bias_attrs[i], dtype) + cell_bw = StackedGRUCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, gate_activation, activation, 1, + dropout, bw_param_attrs[i], + bw_bias_attrs[i], dtype) + self.gru.append( + self.add_sublayer( + 
"gru_%d" % i, + BidirectionalRNN( + cell_fw, + cell_bw, + merge_mode=merge_mode, + time_major=time_major))) + + def forward(self, inputs, initial_states=None, sequence_length=None): + if not self.merge_each_layer: + return self.gru(inputs, initial_states, sequence_length) + else: + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == self.num_layers, ( + "length of initial_states should be %d when it is a list/tuple" + % self.num_layers) + else: + initial_states = [initial_states] * self.num_layers + stacked_states = [] + for i in range(self.num_layers): + outputs, states = self.gru[i](inputs, initial_states[i], + sequence_length) + inputs = outputs + stacked_states.append(states) + return outputs, stacked_states From f12434620404fed71daf2738cf1de09fcf03ad6d Mon Sep 17 00:00:00 2001 From: guosheng Date: Sat, 9 May 2020 14:20:21 +0800 Subject: [PATCH 04/16] Add api docs for RNN related apis. --- hapi/text/text.py | 3651 ++++++++++++++++++++++++++++----------------- 1 file changed, 2283 insertions(+), 1368 deletions(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index b5a0cf5..5327bbd 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -45,17 +45,40 @@ from paddle.fluid.layers import BeamSearchDecoder __all__ = [ - 'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode', - 'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', - 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', - 'TransformerDecoder', 'TransformerCell', 'TransformerBeamSearchDecoder', - 'LinearChainCRF', 'CRFDecoding', 'SequenceTagging', 'GRUEncoder', - 'StackedLSTMCell', 'LSTM', 'BidirectionalLSTM', 'StackedGRUCell', 'GRU', - 'BidirectionalGRU' + 'RNNCell', + 'BasicLSTMCell', + 'BasicGRUCell', + 'RNN', + 'StackedLSTMCell', + 'LSTM', + 'BidirectionalLSTM', + 'StackedGRUCell', + 'GRU', + 'BidirectionalGRU', + 'DynamicDecode', + 'BeamSearchDecoder', + 'MultiHeadAttention', + 'FFN', + 'TransformerEncoderLayer', + 'TransformerEncoder', + 'TransformerDecoderLayer', + 'TransformerDecoder', + 'TransformerCell', + 'TransformerBeamSearchDecoder', + 'LinearChainCRF', + 'CRFDecoding', + 'SequenceTagging', + 'GRUEncoder', ] class RNNCell(Layer): + """ + RNNCell is the base class for abstraction representing the calculations + mapping the input and state to the output and new state. It is suitable to + and mostly used in RNN. + """ + def get_initial_states(self, batch_ref, shape=None, @@ -70,16 +93,18 @@ def get_initial_states(self, batch_ref: A (possibly nested structure of) tensor variable[s]. The first dimension of the tensor will be used as batch size to initialize states. - shape: A (possiblely nested structure of) shape[s], where a shape is + shape: A (possibly nested structure of) shape[s], where a shape is represented as a list/tuple of integer). -1(for batch size) will beautomatically inserted if shape is not started with it. If None, property `state_shape` will be used. The default value is None. - dtype: A (possiblely nested structure of) data type[s]. The structure + dtype: A (possibly nested structure of) data type[s]. The structure must be same as that of `shape`, except when all tensors' in states has the same data type, a single data type can be used. If None and property `cell.state_shape` is not available, float32 will be used as the data type. The default value is None. init_value: A float value used to initialize states. + batch_dim_idx: An integer indicating which dimension of the tensor in + inputs represents batch size. The default value is 0. 
Returns: Variable: tensor variable[s] packed in the same structure provided \ @@ -170,46 +195,61 @@ def state_dtype(self): class BasicLSTMCell(RNNCell): """ - **** - BasicLSTMUnit class, Using basic operator to build LSTM - The algorithm can be described as the code below. - .. math:: - i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i) - f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias ) - o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o) - \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c) - c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} - h_t &= o_t \odot tanh(c_t) - - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix - of weights from the input gate to the input) - - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). - - sigmoid is the logistic sigmoid function. - - $i, f, o$ and $c$ are the input gate, forget gate, output gate, - and cell activation vectors, respectively, all of which have the same size as - the cell output activation vector $h$. - - The :math:`\odot` is the element-wise product of the vectors. - - :math:`tanh` is the activation functions. - - :math:`\\tilde{c_t}` is also called candidate hidden state, - which is computed based on the current input and the previous hidden state. - Args: - name_scope(string) : The name scope used to identify parameter and bias name - hidden_size (integer): The hidden size used in the Unit. - param_attr(ParamAttr|None): The parameter attribute for the learnable - weight matrix. Note: - If it is set to None or one attribute of ParamAttr, lstm_unit will - create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The parameter attribute for the bias - of LSTM unit. - If it is set to None or one attribute of ParamAttr, lstm_unit will - create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized as zero. Default: None. - gate_activation (function|None): The activation function for gates (actGate). - Default: 'fluid.layers.sigmoid' - activation (function|None): The activation function for cells (actNode). - Default: 'fluid.layers.tanh' - forget_bias(float|1.0): forget bias used when computing forget gate - dtype(string): data type used in this unit + Long-Short Term Memory(LSTM) RNN cell. + + The formula used is as follows: + + .. math:: + + i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) + + f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) + + c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) + + o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) + + h_{t} & = o_{t} act_c (c_{t}) + + Please refer to `An Empirical Exploration of Recurrent Network Architectures + `_ + for more details. + + Parameters: + input_size (int): The input size in the LSTM cell. + hidden_size (int): The hidden size in the LSTM cell. + param_attr(ParamAttr, optional): The parameter attribute for the learnable + weight matrix. Default: None. + bias_attr (ParamAttr, optional): The parameter attribute for the bias + of LSTM. Default: None. + gate_activation (function, optional): The activation function for gates + of LSTM, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + LSTM, that is :math:`act_c` in the formula. 
Default: None, + representing for 'fluid.layers.tanh'. + forget_bias(float, optional): forget bias used when computing forget gate. + Default 1.0 + dtype(string, optional): The data type used in this cell. Default float32. + forget_gate_weights (dict, optional): A dict includes `w`, `h` and `b` + as keys, and the corresponding values should be instances of Parameter + which represent :math:`W_{x_{f}}, W_{h_{f}}, b_{f}` and have shape + [input_size, hidden_size], [hidden_size, hidden_size], [hidden_size] + separately. It is used for reusing and sharing weights when provided, + otherwise create these parameters. Note that parameters from input + gate, forget gate and cell would be concatenated in implementation. + input_gate_weights (dict, optional): A dict includes `w`, `h` and `b` as keys, + and the corresponding values should be instances of Parameter which + represent :math:`W_{x_{i}}, W_{h_{i}}, b_{i}` separately. It has the + same usage as :attr:`forget_gate_weights`. + output_gate_weights (dict, optional): A dict includes `w`, `h` and `b` as keys, + and the corresponding values should be instances of Parameter which + represent :math:`W_{x_{o}}, W_{h_{o}}, b_{o}` separately. It has the + same usage as :attr:`forget_gate_weights`. + cell_weights (dict, optional): A dict includes `w`, `h` and `b` as keys, + and the corresponding values should be instances of Parameter which + represent :math:`W_{x_{c}}, W_{h_{c}}, b_{c}` separately. It has the + same usage as :attr:`forget_gate_weights`. """ def __init__(self, @@ -480,41 +520,63 @@ def forward(self, input, state): @property def state_shape(self): + """ + The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]` + (-1 for batch size would be automatically inserted into shape). These two + shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. + """ return [[self._hidden_size], [self._hidden_size]] class BasicGRUCell(RNNCell): """ - **** - BasicGRUUnit class, using basic operators to build GRU - The algorithm can be described as the equations below. - - .. math:: - u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u) - - r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r) - - m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m) - - h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t) - - Args: - hidden_size (integer): The hidden size used in the Unit. - param_attr(ParamAttr|None): The parameter attribute for the learnable - weight matrix. Note: - If it is set to None or one attribute of ParamAttr, gru_unit will - create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The parameter attribute for the bias - of GRU unit. - If it is set to None or one attribute of ParamAttr, gru_unit will - create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - gate_activation (function|None): The activation function for gates (actGate). - Default: 'fluid.layers.sigmoid' - activation (function|None): The activation function for cell (actNode). - Default: 'fluid.layers.tanh' - dtype(string): data type used in this unit + Gated Recurrent Unit (GRU) RNN cell. + + The formula for GRU used is as follows: + + .. 
math:: + + u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) + + r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) + + \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + + Please refer to `An Empirical Exploration of Recurrent Network Architectures + `_ + for more details. + + Parameters: + input_size (int): The input size for the first GRU cell. + hidden_size (int): The hidden size for every GRU cell. + param_attr(ParamAttr, optional): The parameter attribute for the learnable + weight matrix. Default: None. + bias_attr (ParamAttr, optional): The parameter attribute for the bias + of LSTM. Default: None. + gate_activation (function, optional): The activation function for gates + of GRU, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + GRU, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + dtype(string, optional): The data type used in this cell. Default float32. + update_gate_weights (dict, optional): A dict includes `w`, `h` and `b` + as keys, and the corresponding values should be instances of Parameter + which represent :math:`W_{ux}, W_{uh}, b_{u}` and have shape + [input_size, hidden_size], [hidden_size, hidden_size], [hidden_size] + separately. It is used for reusing and sharing weights when provided, + otherwise create these parameters. Note that parameters from update + gate and reset gate would be concatenated in implementation. + reset_gate_weights (dict, optional): A dict includes `w`, `h` and `b` as keys, + and the corresponding values should be instances of Parameter which + represent :math:`W_{rx}, W_{rh}, b_{r}` separately. It has the + same usage as :attr:`update_gate_weights`. + cell_weights (dict, optional): A dict includes `w`, `h` and `b` as keys, + and the corresponding values should be instances of Parameter which + represent :math:`W_{cx}, W_{ch}, b_{c}`` separately. It has the + same usage as :attr:`update_gate_weights`. """ def __init__(self, @@ -678,7 +740,7 @@ def __init__(self, if "b" in reset_gate_weights and reset_gate_weights[ "b"] is not None: - self.rg_b = reused_params["b"] + self.rg_b = reset_gate_weights["b"] else: if gate_bias_attr is not None and gate_bias_attr.name is not None: tmp_param_attr = copy.deepcopy(gate_bias_attr) @@ -771,10 +833,44 @@ def forward(self, input, state): @property def state_shape(self): + """ + The `state_shape` of BasicGRUCell is a shape `[hidden_size]` (-1 for batch + size would be automatically inserted into shape). The shape corresponds + to :math:`h_{t-1}`. + """ return [self._hidden_size] -class RNN(fluid.dygraph.Layer): +class RNN(Layer): + """ + RNN creates a recurrent neural network specified by RNNCell `cell`, which + performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. + + Parameters: + cell(RNNCell): An instance of `RNNCell`. + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import StackedLSTMCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = StackedLSTMCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell, inputs=inputs) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + def __init__(self, cell, is_reverse=False, time_major=False): super(RNN, self).__init__() self.cell = cell @@ -790,6 +886,38 @@ def forward(self, initial_states=None, sequence_length=None, **kwargs): + """ + Performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. + + Parameters: + inputs (Variable): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in RNN. + initial_states (Variable, optional): A (possibly nested structure of) + tensor variable[s], representing the initial state for RNN. + If not provided, `cell.get_initial_states` would be used to produce + the initial state. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + + Returns: + tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ + outputs and states, both are Tensor or nested structure of Tensor. \ + `final_outputs` has the same structure and data types as \ + the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ + stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ + for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ + `final_states` is the counterpart at last time step of initial states, \ + thus has the same structure with it and has tensors with same shapes \ + and data types. + """ if fluid.in_dygraph_mode(): class ArrayWrapper(object): @@ -878,1474 +1006,2261 @@ def _maybe_copy(state, new_state, step_mask): return final_outputs, final_states -class DynamicDecode(Layer): - def __init__(self, - decoder, - max_step_num=None, - output_time_major=False, - impute_finished=False, - is_test=False, - return_length=False): - super(DynamicDecode, self).__init__() - self.decoder = decoder - self.max_step_num = max_step_num - self.output_time_major = output_time_major - self.impute_finished = impute_finished - self.is_test = is_test - self.return_length = return_length +class StackedRNNCell(RNNCell): + """ + Wrapper allowing a stack of RNN cells to behave as a single cell. It is used + to implement stacked RNNs. - def forward(self, inits=None, **kwargs): - if fluid.in_dygraph_mode(): + Parameters: + cells (list|tuple): List of RNN cell instances. - class ArrayWrapper(object): - def __init__(self, x): - self.array = [x] + Examples: - def append(self, x): - self.array.append(x) - return self + .. 
code-block:: python - def __getitem__(self, item): - return self.array.__getitem__(item) + from paddle.incubate.hapi.text import BasicLSTMCell, StackedRNNCell - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - state_dtype = state.dtype - if convert_dtype(state_dtype) in ["bool"]: - state = layers.cast(state, dtype="float32") - new_state = layers.cast(new_state, dtype="float32") - if step_mask.dtype != state.dtype: - step_mask = layers.cast(step_mask, dtype=state.dtype) - # otherwise, renamed bool gradients of would be summed up leading - # to sum(bool) error. - step_mask.stop_gradient = True - new_state = layers.elementwise_mul( - state, step_mask, axis=0) - layers.elementwise_mul( - new_state, (step_mask - 1), axis=0) - if convert_dtype(state_dtype) in ["bool"]: - new_state = layers.cast(new_state, dtype=state_dtype) - return new_state + cells = [BasicLSTMCell(32, 32), BasicLSTMCell(32, 32)] + stack_rnn = StackedRNNCell(cells) + """ - initial_inputs, initial_states, initial_finished = self.decoder.initialize( - inits) - inputs, states, finished = (initial_inputs, initial_states, - initial_finished) - cond = layers.logical_not((layers.reduce_all(initial_finished))) - sequence_lengths = layers.cast( - layers.zeros_like(initial_finished), "int64") - outputs = None + def __init__(self, cells): + self.cells = [] + for i, cell in enumerate(cells): + self.cells.append(self.add_sublayer("cell_%d" % i, cell)) - step_idx = 0 - step_idx_tensor = layers.fill_constant( - shape=[1], dtype="int64", value=step_idx) - while cond.numpy(): - (step_outputs, next_states, next_inputs, - next_finished) = self.decoder.step(step_idx_tensor, inputs, - states, **kwargs) - if not self.decoder.tracks_own_finished: - # BeamSearchDecoder would track it own finished, since - # beams would be reordered and the finished status of each - # entry might change. Otherwise, perform logical OR which - # would not change the already finished. - next_finished = layers.logical_or(next_finished, finished) - # To confirm states.finished/finished be consistent with - # next_finished. - layers.assign(next_finished, finished) - next_sequence_lengths = layers.elementwise_add( - sequence_lengths, - layers.cast( - layers.logical_not(finished), sequence_lengths.dtype)) + def forward(self, inputs, states, **kwargs): + """ + Performs :code:`cell.forward` for all including cells sequentially. + Each cell's `inputs` is the `outputs` of the previous cell. And each + cell's `states` is the corresponding one in `states`. - if self.impute_finished: # rectify the states for the finished. - next_states = map_structure( - lambda x, y: _maybe_copy(x, y, finished), states, - next_states) - outputs = map_structure( - lambda x: ArrayWrapper(x), - step_outputs) if step_idx == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) - inputs, states, finished, sequence_lengths = ( - next_inputs, next_states, next_finished, - next_sequence_lengths) + Parameters: + inputs (Variable): The inputs for the first cell. Mostly it is a + float32 or float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + **kwargs: Additional keyword arguments, which passed to `cell.forward` + for all including cells. - layers.increment(x=step_idx_tensor, value=1.0, in_place=True) - step_idx += 1 + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \ + `outputs` of the last cell. 
`new_states` is a list composed \ + of all cells' `new_states`, and its structure and data type is \ + same as that of `states` argument. + """ + new_states = [] + for cell, state in zip(self.cells, states): + outputs, new_state = cell(inputs, state, **kwargs) + inputs = outputs + new_states.append(new_state) + return outputs, new_states - layers.logical_not(layers.reduce_all(finished), cond) - if self.max_step_num is not None and step_idx > self.max_step_num: - break + @staticmethod + def stack_param_attr(param_attr, n): + """ + If `param_attr` is a list or tuple, convert every element in it to a + ParamAttr instance. Otherwise, repeat `param_attr` `n` times to + construct a list, and rename every one by appending a increasing index + suffix to avoid having same names when `param_attr` contains a name. - final_outputs = map_structure( - lambda x: fluid.layers.stack(x.array, axis=0), outputs) - final_states = states + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. + n (int): The times to repeat to construct a list when `param_attr` + is not a list or tuple. - try: - final_outputs, final_states = self.decoder.finalize( - final_outputs, final_states, sequence_lengths) - except NotImplementedError: - pass + Returns: + list: A list composed of each including cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len(param_attr) == n, ( + "length of param_attr should be %d when it is a list/tuple" % + n) + param_attrs = [ + fluid.ParamAttr._to_attr(attr) for attr in param_attr + ] + else: + param_attrs = [] + attr = fluid.ParamAttr._to_attr(param_attr) + for i in range(n): + attr_i = copy.deepcopy(attr) + if attr.name: + attr_i.name = attr_i.name + "_" + str(i) + param_attrs.append(attr_i) + return param_attrs - if not self.output_time_major: - final_outputs = map_structure( - lambda x: layers.transpose(x, [1, 0] + list( - range(2, len(x.shape)))), final_outputs) + @property + def state_shape(self): + """ + The `state_shape` of StackedRNNCell is a list composed of each including + cell's `state_shape`. - return (final_outputs, final_states, - sequence_lengths) if self.return_length else ( - final_outputs, final_states) - else: - return fluid.layers.dynamic_decode( - self.decoder, - inits, - max_step_num=self.max_step_num, - output_time_major=self.output_time_major, - impute_finished=self.impute_finished, - is_test=self.is_test, - return_length=self.return_length, - **kwargs) + Returns: + list: A list composed of each including cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] -class TransformerCell(Layer): +class StackedLSTMCell(RNNCell): """ - Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be - used as RNNCell + Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used + to implement stacked LSTM. + + The formula for LSTM used here is as follows: + + .. math:: + + i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) + + f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) + + c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) + + o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) + + h_{t} & = o_{t} act_c (c_{t}) + + + Parameters: + input_size (int): The input size for the first LSTM cell. + hidden_size (int): The hidden size for every LSTM cell. 
+ gate_activation (function, optional): The activation function for gates + of LSTM, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + LSTM, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + forget_bias (float, optional): forget bias used when computing forget + gate. It also can accept a boolean value `True`, which would set + :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and + :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . + Default 1.0. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float|list|tuple, optional): The dropout probability after each + LSTM. It also can be a list or tuple, including dropout probabilities + for the corresponding LSTM. Default 0.0 + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import StackedLSTMCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = StackedLSTMCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell, inputs=inputs) + outputs, _ = rnn(inputs) # [2, 4, 64] """ - def __init__(self, decoder, embedding_fn=None, output_fn=None): - super(TransformerCell, self).__init__() - self.decoder = decoder - self.embedding_fn = embedding_fn - self.output_fn = output_fn + def __init__(self, + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(StackedLSTMCell, self).__init__() + self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", + float) + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - def forward(self, inputs, states, trg_src_attn_bias, enc_output, - static_caches): - trg_word, trg_pos = inputs - for cache, static_cache in zip(states, static_caches): - cache.update(static_cache) - if self.embedding_fn is not None: - dec_input = self.embedding_fn(trg_word, trg_pos) - outputs = self.decoder(dec_input, enc_output, None, - trg_src_attn_bias, states) - else: - outputs = self.decoder(trg_word, trg_pos, enc_output, None, - trg_src_attn_bias, states) - if self.output_fn is not None: - outputs = self.output_fn(outputs) - if len(outputs.shape) == 3: - # squeeze to adapt to BeamSearchDecoder which use 2D logits - outputs = layers.squeeze(outputs, [1]) - new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states] + self.cells = [] + for i in range(num_layers): + if forget_bias is True: + bias_attrs[ + i].initializer = 
fluid.initializer.NumpyArrayInitializer( + np.concatenate( + np.zeros(2 * hidden_size), + np.ones(hidden_size), np.zeros(hidden_size)) + .astype(dtype)) + forget_bias = 0.0 + self.cells.append( + self.add_sublayer( + "lstm_%d" % i, + BasicLSTMCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + gate_activation=gate_activation, + activation=activation, + forget_bias=forget_bias, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) + + def forward(self, inputs, states): + """ + Performs the stacked LSTM cells sequentially. Each cell's `inputs` is + the `outputs` of the previous cell. And each cell's `states` is the + corresponding one in `states`. + + Parameters: + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + **kwargs: Additional keyword arguments, which passed to `cell.forward` + for all including cells. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ + is a list composed of every LSTM `new_states` which is a pair \ + of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ + and the data type and structure of these tensors all is same \ + as that of `states`. + """ + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout[i], + dropout_implementation='upscale_in_train') if self.dropout[ + i] > 0 else outputs + inputs = outputs + new_states.append(new_state) return outputs, new_states @property def state_shape(self): - return [{ - "k": [self.decoder.n_head, 0, self.decoder.d_key], - "v": [self.decoder.n_head, 0, self.decoder.d_value], - } for i in range(len(self.decoder.n_layer))] + """ + The `state_shape` of StackedLSTMCell is a list composed of each including + LSTM cell's `state_shape`. + Returns: + list: A list composed of each including LSTM cell's `state_shape`. 
+ """ + return [cell.state_shape for cell in self.cells] -class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): - def __init__(self, cell, start_token, end_token, beam_size, - var_dim_in_state): - super(TransformerBeamSearchDecoder, - self).__init__(cell, start_token, end_token, beam_size) - self.cell = cell - self.var_dim_in_state = var_dim_in_state - - def _merge_batch_beams_with_var_dim(self, x): - # init length of cache is 0, and it increases with decoding carrying on, - # thus need to reshape elaborately - var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim - x = layers.transpose(x, - list(range(var_dim_in_state, len(x.shape))) + - list(range(0, var_dim_in_state))) - x = layers.reshape( - x, [0] * (len(x.shape) - var_dim_in_state - ) + [self.batch_size * self.beam_size] + - [int(size) for size in x.shape[-var_dim_in_state + 2:]]) - x = layers.transpose( - x, - list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) + - list(range(0, (len(x.shape) + 1 - var_dim_in_state)))) - return x - - def _split_batch_beams_with_var_dim(self, x): - var_dim_size = layers.shape(x)[self.var_dim_in_state] - x = layers.reshape( - x, [-1, self.beam_size] + - [int(size) - for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + - [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) - return x - - def step(self, time, inputs, states, **kwargs): - # compared to RNN, Transformer has 3D data at every decoding step - inputs = layers.reshape(inputs, [-1, 1]) # token - pos = layers.ones_like(inputs) * time # pos - cell_states = map_structure(self._merge_batch_beams_with_var_dim, - states.cell_states) - - cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, - **kwargs) - cell_outputs = map_structure(self._split_batch_beams, cell_outputs) - next_cell_states = map_structure(self._split_batch_beams_with_var_dim, - next_cell_states) - - beam_search_output, beam_search_state = self._beam_search_step( - time=time, - logits=cell_outputs, - next_cell_states=next_cell_states, - beam_state=states) - next_inputs, finished = (beam_search_output.predicted_ids, - beam_search_state.finished) - - return (beam_search_output, beam_search_state, next_inputs, finished) - -### Transformer Modules ### -class PrePostProcessLayer(Layer): +class LSTM(Layer): """ - PrePostProcessLayer + Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input + sequence. + + The formula for LSTM used here is as follows: + + .. math:: + + i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) + + f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) + + c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) + + o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) + + h_{t} & = o_{t} act_c (c_{t}) + + + Parameters: + input_size (int): The input feature size for the first LSTM. + hidden_size (int): The hidden size for every LSTM. + gate_activation (function, optional): The activation function for gates + of LSTM, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + LSTM, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + forget_bias (float, optional): forget bias used when computing forget + gate. It also can accept a boolean value `True`, which would set + :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and + :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. 
This is recommended in + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . + Default 1.0. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float|list|tuple, optional): The dropout probability after each + LSTM. It also can be a list or tuple, including dropout probabilities + for the corresponding LSTM. Default 0.0 + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import LSTM + + inputs = paddle.rand((2, 4, 32)) + lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) + outputs, _ = lstm(inputs) # [2, 4, 64] """ def __init__(self, - process_cmd, - d_model, - dropout_rate, - reused_layer_norm=None): - super(PrePostProcessLayer, self).__init__() - self.process_cmd = process_cmd - self.functors = [] - for cmd in self.process_cmd: - if cmd == "a": # add residual connection - self.functors.append( - lambda x, y: x + y if y is not None else x) - elif cmd == "n": # add layer normalization - if reused_layer_norm is not None: - layer_norm = reused_layer_norm - else: - layer_norm = LayerNorm( - normalized_shape=d_model, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.))) + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + is_reverse=False, + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(LSTM, self).__init__() + lstm_cell = StackedLSTMCell(input_size, hidden_size, gate_activation, + activation, forget_bias, num_layers, + dropout, param_attr, bias_attr, dtype) + self.lstm = RNN(lstm_cell, is_reverse, time_major) - self.functors.append( - self.add_sublayer( - "layer_norm_%d" % len( - self.sublayers(include_sublayers=False)), - layer_norm)) - elif cmd == "d": # add dropout - self.functors.append(lambda x: layers.dropout( - x, dropout_prob=dropout_rate, is_test=False) - if dropout_rate else x) + def forward(self, inputs, initial_states=None, sequence_length=None): + """ + Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` + is the `inputs` of the subsequent one. 
- def forward(self, x, residual=None): - for i, cmd in enumerate(self.process_cmd): - if cmd == "a": - x = self.functors[i](x, residual) - else: - x = self.functors[i](x) - return x + Parameters: + inputs (Variable): The inputs for the first LSTM. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked LSTM, and the initial states of each LSTM is a pair + of tensors shaped `[batch_size, hidden_size]`. If not provided, + use 0 as initial states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is the output of last LSTM and it is a tensor with shape \ + `[batch_size, sequence_length, hidden_size]` and has the same \ + data type as `inputs`, `final_states` is the counterpart of \ + `initial_states` at last time step, thus has the same structure \ + with it and has tensors with same shapes data types. + """ + return self.lstm(inputs, initial_states, sequence_length) -class MultiHeadAttention(Layer): +class BidirectionalRNN(Layer): """ - Multi-Head Attention + Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform + forward and backward RNN separately, and merge outputs of these two RNN + according to `merge_mode`. + + Parameters: + cell_fw (RNNCell): A RNNCell instance used for forward RNN. + cell_bw (RNNCell): A RNNCell instance used for backward RNN. + merge_mode (str|None, optional): The way to merget outputs of forward and + backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, + where None stands for make the two `outputs` as a tuple, `zip` stands + for make each two corresponding tensors of the two `outputs` as a tuple. + Default `concat` + + Examples: + + .. 
code-block:: python + + import paddle + from paddle.incubate.hapi.text import BasicLSTMCell, StackedRNNCell + + inputs = paddle.rand((2, 4, 32)) + cell_fw = StackedLSTMCell(32, 64) + cell_bw = StackedLSTMCell(32, 64) + bi_rnn = BidirectionalRNN(cell_fw, cell_bw) + outputs, _ = bi_rnn(inputs) # [2, 4, 128] """ def __init__(self, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0.0, - reused_query_fc=None, - reused_key_fc=None, - reused_value_fc=None, - reused_proj_fc=None): - - super(MultiHeadAttention, self).__init__() - self.n_head = n_head - self.d_key = d_key - self.d_value = d_value - self.d_model = d_model - self.dropout_rate = dropout_rate - - if reused_query_fc is not None: - self.q_fc = reused_query_fc - else: - self.q_fc = Linear( - input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) - if reused_key_fc is not None: - self.k_fc = reused_key_fc - else: - self.k_fc = Linear( - input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) - if reused_value_fc is not None: - self.v_fc = reused_value_fc - else: - self.v_fc = Linear( - input_dim=d_model, - output_dim=d_value * n_head, - bias_attr=False) - if reused_proj_fc is not None: - self.proj_fc = reused_proj_fc + cell_fw, + cell_bw, + merge_mode='concat', + time_major=False, + cell_cls=None, + **kwargs): + super(BidirectionalRNN, self).__init__() + self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) + self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) + if merge_mode == 'concat': + self.merge_func = lambda x, y: layers.concat([x, y], -1) + elif merge_mode == 'sum': + self.merge_func = lambda x, y: layers.elementwise_add(x, y) + elif merge_mode == 'ave': + self.merge_func = lambda x, y: layers.scale( + layers.elementwise_add(x, y), 0.5) + elif merge_mode == 'mul': + self.merge_func = lambda x, y: layers.elementwise_mul(x, y) + elif merge_mode == 'zip': + self.merge_func = lambda x, y: (x, y) + elif merge_mode is None: + self.merge_func = None else: - self.proj_fc = Linear( - input_dim=d_value * n_head, - output_dim=d_model, - bias_attr=False) + raise ValueError('Unsupported value for `merge_mode`: %s' % + merge_mode) - def _prepare_qkv(self, queries, keys, values, cache=None): - if keys is None: # self-attention - keys, values = queries, queries - static_kv = False - else: # cross-attention - static_kv = True + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + """ + Performs forward and backward RNN separately, and merge outputs of these + two RNN according to `merge_mode`. - q = self.q_fc(queries) - q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) - q = layers.transpose(x=q, perm=[0, 2, 1, 3]) + Parameters: + inputs (Variable): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in both forward and backward RNN. + initial_states (Variable|list|tuple): If it is a list or tuple, its + length should be 2 to include initial states of forward and backward + RNN separately. Otherwise it would be used twice for the two RNN. + If None, `cell.get_initial_states` would be used to produce the initial + states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. 
+ It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. - if cache is not None and static_kv and "static_k" in cache: - # for encoder-decoder attention in inference and has cached - k = cache["static_k"] - v = cache["static_v"] + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is produced by merge outputs of forward and backward RNN according \ + to `merge_mode`, `final_states` is a pair including `final_states` \ + of forward and backward RNN. + """ + if isinstance(initial_states, (list, tuple)): + assert len( + initial_states + ) == 2, "length of initial_states should be 2 when it is a list/tuple" else: - k = self.k_fc(keys) - v = self.v_fc(values) - k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) - k = layers.transpose(x=k, perm=[0, 2, 1, 3]) - v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) - v = layers.transpose(x=v, perm=[0, 2, 1, 3]) - - if cache is not None: - if static_kv and not "static_k" in cache: - # for encoder-decoder attention in inference and has not cached - cache["static_k"], cache["static_v"] = k, v - elif not static_kv: - # for decoder self-attention in inference - cache_k, cache_v = cache["k"], cache["v"] - k = layers.concat([cache_k, k], axis=2) - v = layers.concat([cache_v, v], axis=2) - cache["k"], cache["v"] = k, v + initial_states = [initial_states, initial_states] + outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], + sequence_length, **kwargs) + outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], + sequence_length, **kwargs) + outputs = map_structure( + self.merge_func, outputs_fw, + outputs_bw) if self.merge_func else (outputs_fw, outputs_bw) + return outputs, (states_fw, states_bw) - return q, k, v + @staticmethod + def bidirect_param_attr(param_attr): + """ + Converts `param_attr` to a pair of `param_attr` when it is not a list + or tuple with length 2, also rename every one by appending a suffix to + avoid having same names when `param_attr` contains a name. - def forward(self, queries, keys, values, attn_bias, cache=None): - # compute q ,k ,v - q, k, v = self._prepare_qkv(queries, keys, values, cache) + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. When + it is a list or tuple, its length must be 2. 
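+                When a single ParamAttr with a name is given, the returned pair
+                reuses it with "_fw" and "_bw" appended to the name, e.g. a
+                name "bi_rnn" yields "bi_rnn_fw" and "bi_rnn_bw".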
- # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5) - if attn_bias is not None: - product += attn_bias - weights = layers.softmax(product) - if self.dropout_rate: - weights = layers.dropout( - weights, dropout_prob=self.dropout_rate, is_test=False) - - out = layers.matmul(weights, v) - - # combine heads - out = layers.transpose(out, perm=[0, 2, 1, 3]) - out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) - - # project to output - out = self.proj_fc(out) - return out - - def cal_kv(self, keys, values): - k = self.k_fc(keys) - v = self.v_fc(values) - k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) - k = layers.transpose(x=k, perm=[0, 2, 1, 3]) - v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) - v = layers.transpose(x=v, perm=[0, 2, 1, 3]) - return k, v + Returns: + list: A pair composed of forward and backward RNN cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len( + param_attr + ) == 2, "length of param_attr should be 2 when it is a list/tuple" + param_attrs = param_attr + else: + param_attrs = [] + attr = fluid.ParamAttr._to_attr(param_attr) + attr_fw = copy.deepcopy(attr) + if attr.name: + attr_fw.name = attr_fw.name + "_fw" + param_attrs.append(attr_fw) + attr_bw = copy.deepcopy(attr) + if attr.name: + attr_bw.name = attr_bw.name + "_bw" + param_attrs.append(attr_bw) + return param_attrs -class FFN(Layer): +class BidirectionalLSTM(Layer): """ - Feed-Forward Network + Applies a bidirectional multi-layer long short-term memory (LSTM) RNN to an + input sequence. + + Bidirection interaction can happen after each layer or only after the last + layer according to the `merge_each_layer` setting. The way to interact, + that is how to merge outputs of the two direction, is determined by `merge_mode`. + + The formula for LSTM used here is as follows: + + .. math:: + + i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) + + f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) + + c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) + + o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) + + h_{t} & = o_{t} act_c (c_{t}) + + + Parameters: + input_size (int): The input feature size for the first LSTM. + hidden_size (int): The hidden size for every LSTM. + gate_activation (function, optional): The activation function for gates + of LSTM, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + LSTM, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + forget_bias (float, optional): forget bias used when computing forget + gate. It also can accept a boolean value `True`, which would set + :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and + :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . + Default 1.0. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float|list|tuple, optional): The dropout probability after each + LSTM. It also can be a list or tuple, including dropout probabilities + for the corresponding LSTM. Default 0.0 + merge_mode (str|None, optional): The way to merget outputs of forward and + backward RNN. 
It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, + where None stands for make the two `outputs` as a tuple, `zip` stands + for make each two corresponding tensors of the two `outputs` as a tuple. + Default `concat` + merge_each_layer (bool, optional): Indicate whether bidirection interaction + happens after each layer or only after the last layer. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import BidirectionalLSTM + + inputs = paddle.rand((2, 4, 32)) + bi_lstm = BidirectionalLSTM(input_size=32, hidden_size=64, num_layers=2) + outputs, _ = bi_lstm(inputs) # [2, 4, 128] """ def __init__(self, - d_inner_hid, - d_model, - dropout_rate, - fc1_act="relu", - reused_fc1=None, - reused_fc2=None): - super(FFN, self).__init__() - self.dropout_rate = dropout_rate - if reused_fc1 is not None: - self.fc1 = reused_fc1 - else: - self.fc1 = Linear( - input_dim=d_model, output_dim=d_inner_hid, act=fc1_act) - if reused_fc2 is not None: - self.fc2 = reused_fc2 + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + merge_mode='concat', + merge_each_layer=False, + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(BidirectionalLSTM, self).__init__() + self.num_layers = num_layers + self.merge_mode = merge_mode + self.merge_each_layer = merge_each_layer + param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) + bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) + if not merge_each_layer: + cell_fw = StackedLSTMCell(input_size, hidden_size, gate_activation, + activation, forget_bias, num_layers, + dropout, param_attrs[0], bias_attrs[0], + dtype) + cell_bw = StackedLSTMCell(input_size, hidden_size, gate_activation, + activation, forget_bias, num_layers, + dropout, param_attrs[1], bias_attrs[1], + dtype) + self.lstm = BidirectionalRNN( + cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major) else: - self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) + fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], + num_layers) + bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], + num_layers) + fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], + num_layers) + bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], + num_layers) - def forward(self, x): - hidden = self.fc1(x) - if self.dropout_rate: - hidden = 
layers.dropout( - hidden, dropout_prob=self.dropout_rate, is_test=False) - out = self.fc2(hidden) - return out + # maybe design cell including both forward and backward later + self.lstm = [] + for i in range(num_layers): + cell_fw = StackedLSTMCell( + input_size if i == 0 else (hidden_size * 2 + if merge_mode == 'concat' else + hidden_size), hidden_size, + gate_activation, activation, forget_bias, 1, dropout, + fw_param_attrs[i], fw_bias_attrs[i], dtype) + cell_bw = StackedLSTMCell( + input_size if i == 0 else (hidden_size * 2 + if merge_mode == 'concat' else + hidden_size), hidden_size, + gate_activation, activation, forget_bias, 1, dropout, + bw_param_attrs[i], bw_bias_attrs[i], dtype) + self.lstm.append( + self.add_sublayer( + "lstm_%d" % i, + BidirectionalRNN( + cell_fw, + cell_bw, + merge_mode=merge_mode, + time_major=time_major))) + def forward(self, inputs, initial_states=None, sequence_length=None): + """ + Performs bidirectional multi-layer LSTM layer by layer. Each LSTM's `outputs` + is the `inputs` of the subsequent one, or when `merge_each_layer` is True, + merged outputs would be the `inputs` of the subsequent one. -class TransformerEncoderLayer(Layer): + Parameters: + inputs (Variable): The inputs for the first LSTM. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked LSTM. If `merge_each_layer` is True, the length of + list should be `num_layers` and a single value would be reused for + `num_layers`; Otherwise, the length should be 2 and a single value + would be reused twice. If not provided, use 0 as initial states. + Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is the output of last bidirectional LSTM; `final_states` is a \ + pair including `final_states` of forward and backward LSTM when \ + `merge_each_layer` is False or a list including `final_states` \ + of all stacked bidirectional LSTM, and it has tensors with same \ + shapes data types as `initial_states`. + """ + if not self.merge_each_layer: + return self.lstm(inputs, initial_states, sequence_length) + else: + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == self.num_layers, ( + "length of initial_states should be %d when it is a list/tuple" + % self.num_layers) + else: + initial_states = [initial_states] * self.num_layers + stacked_states = [] + for i in range(self.num_layers): + outputs, states = self.lstm[i](inputs, initial_states[i], + sequence_length) + inputs = outputs + stacked_states.append(states) + return outputs, stacked_states + + +class StackedGRUCell(RNNCell): """ - EncoderLayer + Wrapper allowing a stack of GRU cells to behave as a single cell. It is used + to implement stacked GRU. + + The formula for GRU used here is as follows: + + .. math:: + + u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) + + r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) + + \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + + + Parameters: + input_size (int): The input size for the first GRU cell. 
+ hidden_size (int): The hidden size for every GRU cell. + gate_activation (function, optional): The activation function for gates + of GRU, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + GRU, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float|list|tuple, optional): The dropout probability after each + GRU. It also can be a list or tuple, including dropout probabilities + for the corresponding GRU. Default 0.0 + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import StackedLSTMCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = StackedGRUCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell, inputs=inputs) + outputs, _ = rnn(inputs) # [2, 4, 64] """ def __init__(self, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu", - reused_pre_selatt_layernorm=None, - reused_multihead_att_weights={ - "reused_query_fc": None, - "reused_key_fc": None, - "reused_value_fc": None, - "reused_proj_fc": None - }, - reused_post_selfatt_layernorm=None, - reused_pre_ffn_layernorm=None, - reused_ffn_weights={"reused_fc1": None, - "reused_fc2": None}, - reused_post_ffn_layernorm=None): + input_size, + hidden_size, + gate_activation=None, + activation=None, + num_layers=1, + dropout=0.0, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(StackedGRUCell, self).__init__() + self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", + float) + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - super(TransformerEncoderLayer, self).__init__() + self.cells = [] + for i in range(num_layers): + self.cells.append( + self.add_sublayer( + "gru_%d" % i, + BasicGRUCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + gate_activation=gate_activation, + activation=activation, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) - self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_selatt_layernorm) - self.self_attn = MultiHeadAttention( - d_key, - d_value, - d_model, - n_head, - attention_dropout, - reused_query_fc=reused_multihead_att_weights["reused_query_fc"], - reused_key_fc=reused_multihead_att_weights["reused_key_fc"], - reused_value_fc=reused_multihead_att_weights["reused_value_fc"], - 
reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"]) - self.postprocesser1 = PrePostProcessLayer( - postprocess_cmd, d_model, prepostprocess_dropout, - reused_post_selfatt_layernorm) + def forward(self, inputs, states): + """ + Performs the stacked GRU cells sequentially. Each cell's `inputs` is + the `outputs` of the previous cell. And each cell's `states` is the + corresponding one in `states`. - self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_ffn_layernorm) - self.ffn = FFN(d_inner_hid, - d_model, - relu_dropout, - fc1_act=ffn_fc1_act, - reused_fc1=reused_ffn_weights["reused_fc1"], - reused_fc2=reused_ffn_weights["reused_fc2"]) - self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout, - reused_post_ffn_layernorm) + Parameters: + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + **kwargs: Additional keyword arguments, which passed to `cell.forward` + for all including cells. - def forward(self, enc_input, attn_bias): - attn_output = self.self_attn( - self.preprocesser1(enc_input), None, None, attn_bias) - attn_output = self.postprocesser1(attn_output, enc_input) + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last GRU; `new_states` \ + is a list composed of every GRU `new_states` which is also \ + :math:`h_{t}` in the formula, and the data type and structure \ + of these tensors all is same as that of `states`. + """ + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout[i], + dropout_implementation='upscale_in_train') if self.dropout[ + i] > 0 else outputs + inputs = outputs + new_states.append(new_state) + return outputs, new_states - ffn_output = self.ffn(self.preprocesser2(attn_output)) - ffn_output = self.postprocesser2(ffn_output, attn_output) - return ffn_output + @property + def state_shape(self): + """ + The `state_shape` of StackedGRUCell is a list composed of each including + GRU cell's `state_shape`. + Returns: + list: A list composed of each including GRU cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] -class TransformerEncoder(Layer): + +class GRU(Layer): """ - encoder + Applies a stacked multi-layer gated recurrent unit (GRU) RNN to an input + sequence. + + The formula for GRU used here is as follows: + + .. math:: + + u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) + + r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) + + \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + + + Parameters: + input_size (int): The input size for the first GRU cell. + hidden_size (int): The hidden size for every GRU cell. + gate_activation (function, optional): The activation function for gates + of GRU, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + GRU, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + num_layers(int, optional): The number of GRU to be stacked. Default 1. 
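# A plain-Python sketch of the control flow in StackedGRUCell.forward above:
# each cell consumes the previous cell's output, dropout (when non-zero) sits
# between layers, and every cell's new state is collected. `cells`, `dropouts`
# and `dropout_fn` are hypothetical stand-ins for the layer's real attributes
# and for fluid.layers.dropout.
def stacked_cell_step(cells, dropouts, inputs, states, dropout_fn=None):
    new_states = []
    for cell, rate, state in zip(cells, dropouts, states):
        outputs, new_state = cell(inputs, state)      # one single-layer GRU step
        if rate > 0 and dropout_fn is not None:
            outputs = dropout_fn(outputs, rate)       # inter-layer dropout
        inputs = outputs                              # feed the next cell
        new_states.append(new_state)
    return outputs, new_states

# Toy usage with trivial "cells" (new state = output = input + state):
toy_cell = lambda x, s: (x + s, x + s)
out, states = stacked_cell_step([toy_cell, toy_cell], [0.0, 0.0], 1.0, [0.0, 0.0])
print(out, states)  # 1.0 [1.0, 1.0]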
+ dropout(float|list|tuple, optional): The dropout probability after each + GRU. It also can be a list or tuple, including dropout probabilities + for the corresponding GRU. Default 0.0 + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import LSTM + + inputs = paddle.rand((2, 4, 32)) + gru = GRU(input_size=32, hidden_size=64, num_layers=2) + outputs, _ = gru(inputs) # [2, 4, 64] """ def __init__(self, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu"): - - super(TransformerEncoder, self).__init__() + input_size, + hidden_size, + gate_activation=None, + activation=None, + num_layers=1, + dropout=0.0, + is_reverse=False, + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(GRU, self).__init__() + gru_cell = StackedGRUCell(input_size, hidden_size, gate_activation, + activation, num_layers, dropout, param_attr, + bias_attr, dtype) + self.gru = RNN(gru_cell, is_reverse, time_major) - self.encoder_layers = list() - for i in range(n_layer): - self.encoder_layers.append( - self.add_sublayer( - "layer_%d" % i, - TransformerEncoderLayer( - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - ffn_fc1_act=ffn_fc1_act))) - self.processer = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) + def forward(self, inputs, initial_states=None, sequence_length=None): + """ + Performs the stacked multi-layer GRU layer by layer. Each GRU's `outputs` + is the `inputs` of the subsequent one. - def forward(self, enc_input, attn_bias): - for encoder_layer in self.encoder_layers: - enc_output = encoder_layer(enc_input, attn_bias) - enc_input = enc_output + Parameters: + inputs (Variable): The inputs for the first GRU. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked GRU, and the initial states of each GRU is a tensor + shaped `[batch_size, hidden_size]`. If not provided, use 0 as initial + states. Default None. 
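# A hedged usage sketch for the GRU layer defined above, feeding per-layer
# initial states and real sequence lengths. The import path mirrors the one
# used in the docstring examples and is an assumption here; the data is random
# and purely illustrative.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable
from paddle.incubate.hapi.text import GRU  # assumed import location

with fluid.dygraph.guard():
    gru = GRU(input_size=32, hidden_size=64, num_layers=2, dropout=0.1)
    inputs = to_variable(np.random.rand(2, 4, 32).astype("float32"))
    # one [batch_size, hidden_size] initial state per stacked layer
    init_states = [to_variable(np.zeros((2, 64), dtype="float32"))
                   for _ in range(2)]
    # actual length of each batch entry; padding beyond it is ignored
    seq_len = to_variable(np.array([4, 2], dtype="int64"))
    outputs, final_states = gru(inputs, init_states, seq_len)
    print(outputs.shape)  # [2, 4, 64]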
+ sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. - return self.processer(enc_output) + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is the output of last GRU and it is a tensor with shape \ + `[batch_size, sequence_length, hidden_size]` and has the same \ + data type as `inputs`, `final_states` is the counterpart of \ + `initial_states` at last time step, thus has the same structure \ + with it and has tensors with same shapes data types. + """ + return self.gru(inputs, initial_states, sequence_length) -class TransformerDecoderLayer(Layer): +class BidirectionalGRU(Layer): """ - decoder + Applies a bidirectional multi-layer gated recurrent unit (GRU) RNN to an input + sequence. + + Bidirection interaction can happen after each layer or only after the last + layer according to the `merge_each_layer` setting. The way to interact, + that is how to merge outputs of the two direction, is determined by `merge_mode`. + + The formula for GRU used here is as follows: + + .. math:: + + u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) + + r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) + + \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + + + Parameters: + input_size (int): The input size for the first GRU cell. + hidden_size (int): The hidden size for every GRU cell. + gate_activation (function, optional): The activation function for gates + of GRU, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + GRU, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + num_layers(int, optional): The number of GRU to be stacked. Default 1. + dropout(float|list|tuple, optional): The dropout probability after each + GRU. It also can be a list or tuple, including dropout probabilities + for the corresponding GRU. Default 0.0 + merge_mode (str|None, optional): The way to merget outputs of forward and + backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, + where None stands for make the two `outputs` as a tuple, `zip` stands + for make each two corresponding tensors of the two `outputs` as a tuple. + Default `concat` + merge_each_layer (bool, optional): Indicate whether bidirection interaction + happens after each layer or only after the last layer. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. 
If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import BidirectionalGRU + + inputs = paddle.rand((2, 4, 32)) + gru = BidirectionalGRU(input_size=32, hidden_size=64, num_layers=2) + outputs, _ = bi_gru(inputs) # [2, 4, 128] """ def __init__(self, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd="n", - postprocess_cmd="da", - reused_pre_selfatt_layernorm=None, - reused_self_multihead_att_weights={ - "reused_query_fc": None, - "reused_key_fc": None, - "reused_value_fc": None, - "reused_proj_fc": None - }, - reused_post_selfatt_layernorm=None, - reused_pre_crossatt_layernorm=None, - reused_cross_multihead_att_weights={ - "reused_query_fc": None, - "reused_key_fc": None, - "reused_value_fc": None, - "reused_proj_fc": None - }, - reused_post_crossatt_layernorm=None, - reused_pre_ffn_layernorm=None, - reused_ffn_weights={"reused_fc1": None, - "reused_fc2": None}, - reused_post_ffn_layernorm=None): - super(TransformerDecoderLayer, self).__init__() - - self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_selfatt_layernorm) - self.self_attn = MultiHeadAttention( - d_key, - d_value, - d_model, - n_head, - attention_dropout, - reused_query_fc=reused_self_multihead_att_weights[ - "reused_query_fc"], - reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"], - reused_value_fc=reused_self_multihead_att_weights[ - "reused_value_fc"], - reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"]) - self.postprocesser1 = PrePostProcessLayer( - postprocess_cmd, d_model, prepostprocess_dropout, - reused_post_selfatt_layernorm) - - self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_crossatt_layernorm) - self.cross_attn = MultiHeadAttention( - d_key, - d_value, - d_model, - n_head, - attention_dropout, - reused_query_fc=reused_cross_multihead_att_weights[ - "reused_query_fc"], - reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"], - reused_value_fc=reused_cross_multihead_att_weights[ - "reused_value_fc"], - reused_proj_fc=reused_cross_multihead_att_weights[ - "reused_proj_fc"]) - self.postprocesser2 = PrePostProcessLayer( - postprocess_cmd, d_model, prepostprocess_dropout, - reused_post_crossatt_layernorm) - - self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_ffn_layernorm) - self.ffn = FFN(d_inner_hid, - d_model, - relu_dropout, - reused_fc1=reused_ffn_weights["reused_fc1"], - reused_fc2=reused_ffn_weights["reused_fc2"]) - self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout, - reused_post_ffn_layernorm) + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + merge_mode='concat', + merge_each_layer=False, + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(BidirectionalGRU, self).__init__() + self.num_layers = num_layers + self.merge_mode = merge_mode + self.merge_each_layer = merge_each_layer + param_attrs = 
BidirectionalRNN.bidirect_param_attr(param_attr) + bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) + if not merge_each_layer: + cell_fw = StackedGRUCell(input_size, hidden_size, gate_activation, + activation, num_layers, dropout, + param_attrs[0], bias_attrs[0], dtype) + cell_bw = StackedGRUCell(input_size, hidden_size, gate_activation, + activation, num_layers, dropout, + param_attrs[1], bias_attrs[1], dtype) + self.gru = BidirectionalRNN( + cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major) + else: + fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], + num_layers) + bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], + num_layers) + fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], + num_layers) + bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], + num_layers) - def forward(self, - dec_input, - enc_output, - self_attn_bias, - cross_attn_bias, - cache=None): - self_attn_output = self.self_attn( - self.preprocesser1(dec_input), None, None, self_attn_bias, cache) - self_attn_output = self.postprocesser1(self_attn_output, dec_input) + # maybe design cell including both forward and backward later + self.gru = [] + for i in range(num_layers): + cell_fw = StackedGRUCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, gate_activation, activation, 1, + dropout, fw_param_attrs[i], + fw_bias_attrs[i], dtype) + cell_bw = StackedGRUCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, gate_activation, activation, 1, + dropout, bw_param_attrs[i], + bw_bias_attrs[i], dtype) + self.gru.append( + self.add_sublayer( + "gru_%d" % i, + BidirectionalRNN( + cell_fw, + cell_bw, + merge_mode=merge_mode, + time_major=time_major))) - cross_attn_output = self.cross_attn( - self.preprocesser2(self_attn_output), enc_output, enc_output, - cross_attn_bias, cache) - cross_attn_output = self.postprocesser2(cross_attn_output, - self_attn_output) + def forward(self, inputs, initial_states=None, sequence_length=None): + """ + Performs bidirectional multi-layer GRU layer by layer. Each GRU's `outputs` + is the `inputs` of the subsequent one, or when `merge_each_layer` is True, + merged outputs would be the `inputs` of the subsequent one. - ffn_output = self.ffn(self.preprocesser3(cross_attn_output)) - ffn_output = self.postprocesser3(ffn_output, cross_attn_output) + Parameters: + inputs (Variable): The inputs for the first GRU. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked GRU. If `merge_each_layer` is True, the length of + list should be `num_layers` and a single value would be reused for + `num_layers`; Otherwise, the length should be 2 and a single value + would be reused twice. If not provided, use 0 as initial states. + Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. 
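# A NumPy sketch of what the `merge_mode` options documented above do with the
# forward- and backward-direction outputs before they are handed to the next
# layer or returned. The arrays stand in for [batch, seq_len, hidden] outputs;
# only the merge step itself is shown.
import numpy as np

fw = np.random.rand(2, 4, 8).astype("float32")    # forward-direction outputs
bw = np.random.rand(2, 4, 8).astype("float32")    # backward-direction outputs

merged = {
    "concat": np.concatenate([fw, bw], axis=-1),  # width doubles: [2, 4, 16]
    "sum": fw + bw,                               # [2, 4, 8]
    "ave": (fw + bw) * 0.5,                       # [2, 4, 8]
    "mul": fw * bw,                               # [2, 4, 8]
    "zip": (fw, bw),                              # pair of corresponding tensors
    None: (fw, bw),                               # outputs kept as a tuple
}
print(merged["concat"].shape)                     # (2, 4, 16)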
- return ffn_output + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is the output of last bidirectional GRU; `final_states` is a \ + pair including `final_states` of forward and backward GRU when \ + `merge_each_layer` is False or a list including `final_states` \ + of all stacked bidirectional GRU, and it has tensors with same \ + shapes data types as `initial_states`. + """ + if not self.merge_each_layer: + return self.gru(inputs, initial_states, sequence_length) + else: + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == self.num_layers, ( + "length of initial_states should be %d when it is a list/tuple" + % self.num_layers) + else: + initial_states = [initial_states] * self.num_layers + stacked_states = [] + for i in range(self.num_layers): + outputs, states = self.gru[i](inputs, initial_states[i], + sequence_length) + inputs = outputs + stacked_states.append(states) + return outputs, stacked_states -class TransformerDecoder(Layer): +class DynamicDecode(Layer): """ - decoder + DynamicDecode integrates an Decoder instance to perform dynamic decoding. + + It performs :code:`decoder.step()` repeatedly until the returned Tensor + indicating finished status contains all True values or the number of + decoding step reaches to :attr:`max_step_num`. + + :code:`decoder.initialize()` would be called once before the decoding loop. + If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()` + would be called once after the decoding loop. + + Parameters: + decoder (Decoder): An instance of `Decoder`. + max_step_num (int, optional): The maximum number of steps. If not provided, + decode until the decoder is fully done, or in other words, the returned + Tensor by :code:`decoder.step()` indicating finished status contains + all True. Default `None`. + output_time_major (bool, optional): Indicate the data layout of Tensor included + in the final outputs(the first returned value of this method). If + attr:`False`, the data layout would be batch major with shape + `[batch_size, seq_len, ...]`. If attr:`True`, the data layout would + be time major with shape `[seq_len, batch_size, ...]`. Default: `False`. + impute_finished (bool, optional): If `True`, then states get copied through + for batch entries which are marked as finished, which differs with the + unfinished using the new states returned by :code:`decoder.step()` and + ensures that the final states have the correct values. Otherwise, states + wouldn't be copied through when finished. If the returned `final_states` + is needed, it should be set as True, which causes some slowdown. + Default `False`. + is_test (bool, optional): A flag indicating whether to use test mode. In + test mode, it is more memory saving. Default `False`. + return_length (bool, optional): A flag indicating whether to return an + extra Tensor variable in the output tuple, which stores the actual + lengths of all decoded sequences. Default `False`. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import StackedLSTMCell, RNN + + vocab_size, d_model, = 100, 32 + encoder_output = paddle.rand((2, 4, d_model)) + trg_embeder = fluid.dygraph.Embedding(size=[vocab_size, d_model]) + output_layer = fluid.dygraph.Linear(d_model, vocab_size) + cell = StackedLSTMCell(input_size=d_model, hidden_size=d_model) + decoder = BeamSearchDecoder(decoder_cell, + start_token=0, + end_token=1, + beam_size=4, + embedding_fn=trg_embeder, + output_fn=output_layer) + dynamic_decoder = DynamicDecode(decoder, max_step_num=10) + outputs = dynamic_decoder(cell.get_initial_states(encoder_output)) """ - def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd): - super(TransformerDecoder, self).__init__() - - self.n_layer = n_layer - self.n_head = n_head - self.d_key = d_key - self.d_value = d_value + def __init__(self, + decoder, + max_step_num=None, + output_time_major=False, + impute_finished=False, + is_test=False, + return_length=False): + super(DynamicDecode, self).__init__() + self.decoder = decoder + self.max_step_num = max_step_num + self.output_time_major = output_time_major + self.impute_finished = impute_finished + self.is_test = is_test + self.return_length = return_length - self.decoder_layers = list() - for i in range(n_layer): - self.decoder_layers.append( - self.add_sublayer( - "layer_%d" % i, - TransformerDecoderLayer( - n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, - relu_dropout, preprocess_cmd, postprocess_cmd))) - self.processer = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) + def forward(self, inits=None, **kwargs): + """ + Performs :code:`decoder.step()` repeatedly until the returned Tensor + indicating finished status contains all True values or the number of + decoding step reaches to :attr:`max_step_num`. - def forward(self, - dec_input, - enc_output, - self_attn_bias, - cross_attn_bias, - caches=None): - for i, decoder_layer in enumerate(self.decoder_layers): - dec_output = decoder_layer(dec_input, enc_output, self_attn_bias, - cross_attn_bias, None - if caches is None else caches[i]) - dec_input = dec_output + :code:`decoder.initialize()` would be called once before the decoding loop. + If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()` + would be called once after the decoding loop. - return self.processer(dec_output) + Parameters: + inits (object, optional): Argument passed to `decoder.initialize`. + Default `None`. + **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. - def prepare_static_cache(self, enc_output): - return [ - dict( - zip(("static_k", "static_v"), - decoder_layer.cross_attn.cal_kv(enc_output, enc_output))) - for decoder_layer in self.decoder_layers - ] + Returns: + tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \ + when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \ + The final outputs and states, both are Tensor or nested structure of Tensor. \ + `final_outputs` has the same structure and data types as the :code:`outputs` \ + returned by :code:`decoder.step()` , and each Tenser in `final_outputs` \ + is the stacked of all decoding steps' outputs, which might be revised \ + by :code:`decoder.finalize()` if the decoder has implemented `finalize`. 
\ + `final_states` is the counterpart at last time step of initial states \ + returned by :code:`decoder.initialize()` , thus has the same structure \ + with it and has tensors with same shapes and data types. `sequence_lengths` \ + is an `int64` tensor with the same shape as `finished` returned \ + by :code:`decoder.initialize()` , and it stores the actual lengths of \ + all decoded sequences. + """ + if fluid.in_dygraph_mode(): - def prepare_incremental_cache(self, enc_output): - return [{ - "k": layers.fill_constant_batch_size_like( - input=enc_output, - shape=[-1, self.n_head, 0, self.d_key], - dtype=enc_output.dtype, - value=0), - "v": layers.fill_constant_batch_size_like( - input=enc_output, - shape=[-1, self.n_head, 0, self.d_value], - dtype=enc_output.dtype, - value=0), - } for i in range(self.n_layer)] + class ArrayWrapper(object): + def __init__(self, x): + self.array = [x] + def append(self, x): + self.array.append(x) + return self -#TODO: we should merge GRUCell with BasicGRUCell -class GRUCell(RNNCell): - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation='sigmoid', - candidate_activation='tanh', - origin_mode=False): - super(GRUCell, self).__init__() - self.hidden_size = hidden_size - self.fc_layer = Linear( - input_size, hidden_size * 3, param_attr=param_attr) + def __getitem__(self, item): + return self.array.__getitem__(item) - self.gru_unit = GRUUnit( - hidden_size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode) + def _maybe_copy(state, new_state, step_mask): + # TODO: use where_op + state_dtype = state.dtype + if convert_dtype(state_dtype) in ["bool"]: + state = layers.cast(state, dtype="float32") + new_state = layers.cast(new_state, dtype="float32") + if step_mask.dtype != state.dtype: + step_mask = layers.cast(step_mask, dtype=state.dtype) + # otherwise, renamed bool gradients of would be summed up leading + # to sum(bool) error. + step_mask.stop_gradient = True + new_state = layers.elementwise_mul( + state, step_mask, axis=0) - layers.elementwise_mul( + new_state, (step_mask - 1), axis=0) + if convert_dtype(state_dtype) in ["bool"]: + new_state = layers.cast(new_state, dtype=state_dtype) + return new_state - def forward(self, inputs, states): - # for GRUCell, `step_outputs` and `new_states` both are hidden - x = self.fc_layer(inputs) - hidden, _, _ = self.gru_unit(x, states) - return hidden, hidden + initial_inputs, initial_states, initial_finished = self.decoder.initialize( + inits) + inputs, states, finished = (initial_inputs, initial_states, + initial_finished) + cond = layers.logical_not((layers.reduce_all(initial_finished))) + sequence_lengths = layers.cast( + layers.zeros_like(initial_finished), "int64") + outputs = None - @property - def state_shape(self): - return [self.hidden_size] + step_idx = 0 + step_idx_tensor = layers.fill_constant( + shape=[1], dtype="int64", value=step_idx) + while cond.numpy(): + (step_outputs, next_states, next_inputs, + next_finished) = self.decoder.step(step_idx_tensor, inputs, + states, **kwargs) + if not self.decoder.tracks_own_finished: + # BeamSearchDecoder would track it own finished, since + # beams would be reordered and the finished status of each + # entry might change. Otherwise, perform logical OR which + # would not change the already finished. 
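# A minimal NumPy sketch of the decoder contract that the dygraph loop above
# drives: initialize() once, step() repeatedly until every entry is finished or
# max_step_num is hit, then finalize() if the decoder provides it. `decoder` is
# any hypothetical object following that contract; numpy arrays stand in for
# framework tensors, and beam reordering / impute_finished are left out.
import numpy as np

def simple_dynamic_decode(decoder, max_step_num=None, **kwargs):
    inputs, states, finished = decoder.initialize()
    lengths = np.zeros(finished.shape, dtype="int64")
    outputs_per_step = []
    step = 0
    while not np.all(finished):
        step_outputs, states, inputs, next_finished = decoder.step(
            step, inputs, states, **kwargs)
        finished = np.logical_or(next_finished, finished)    # never un-finish an entry
        lengths += np.logical_not(finished).astype("int64")  # count still-running entries
        outputs_per_step.append(step_outputs)
        step += 1
        if max_step_num is not None and step > max_step_num:
            break
    final_outputs = np.stack(outputs_per_step, axis=0)       # time-major stack
    if hasattr(decoder, "finalize"):
        final_outputs, states = decoder.finalize(final_outputs, states, lengths)
    return final_outputs, states, lengths

class ToyDecoder(object):
    """Hypothetical decoder that emits the step index and stops after 3 steps."""
    def initialize(self):
        return np.zeros(2), {"t": 0}, np.array([False, False])
    def step(self, time, inputs, states, **kwargs):
        states = {"t": states["t"] + 1}
        return np.full(2, time), states, inputs, np.array([states["t"] >= 3] * 2)

outs, _, lens = simple_dynamic_decode(ToyDecoder())
print(outs.shape, lens)  # (3, 2) [2 2]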
+ next_finished = layers.logical_or(next_finished, finished) + # To confirm states.finished/finished be consistent with + # next_finished. + layers.assign(next_finished, finished) + next_sequence_lengths = layers.elementwise_add( + sequence_lengths, + layers.cast( + layers.logical_not(finished), sequence_lengths.dtype)) + if self.impute_finished: # rectify the states for the finished. + next_states = map_structure( + lambda x, y: _maybe_copy(x, y, finished), states, + next_states) + outputs = map_structure( + lambda x: ArrayWrapper(x), + step_outputs) if step_idx == 0 else map_structure( + lambda x, x_array: x_array.append(x), step_outputs, + outputs) + inputs, states, finished, sequence_lengths = ( + next_inputs, next_states, next_finished, + next_sequence_lengths) -#TODO: we should merge GRUCell with BasicGRUCell -class GRUEncoderCell(RNNCell): - def __init__(self, - num_layers, - input_size, - hidden_size, - dropout_prob=0., - init_scale=0.1): - super(GRUEncoderCell, self).__init__() - self.dropout_prob = dropout_prob - # use add_sublayer to add multi-layers - self.gru_cells = [] - for i in range(num_layers): - self.gru_cells.append( - self.add_sublayer( - "gru_%d" % i, - #BasicGRUCell( - GRUCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale))))) + layers.increment(x=step_idx_tensor, value=1.0, in_place=True) + step_idx += 1 - def forward(self, step_input, states): - new_states = [] - for i, gru_cell in enumerate(self.gru_cells): - out, state = gru_cell(step_input, states[i]) - step_input = layers.dropout( - out, - self.dropout_prob, - dropout_implementation='upscale_in_train' - ) if self.dropout_prob > 0 else out - new_states.append(step_input) - return step_input, new_states + layers.logical_not(layers.reduce_all(finished), cond) + if self.max_step_num is not None and step_idx > self.max_step_num: + break - @property - def state_shape(self): - return [cell.state_shape for cell in self.gru_cells] + final_outputs = map_structure( + lambda x: fluid.layers.stack(x.array, axis=0), outputs) + final_states = states + try: + final_outputs, final_states = self.decoder.finalize( + final_outputs, final_states, sequence_lengths) + except NotImplementedError: + pass -class BiGRU(fluid.dygraph.Layer): - def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): - super(BiGRU, self).__init__() - self.gru = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0, - init_bound), - is_reverse=False, - time_major=False) + if not self.output_time_major: + final_outputs = map_structure( + lambda x: layers.transpose(x, [1, 0] + list( + range(2, len(x.shape)))), final_outputs) - self.gru_r = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0, - init_bound), - is_reverse=True, - time_major=False) + return (final_outputs, final_states, + sequence_lengths) if self.return_length else ( + final_outputs, final_states) + else: + return fluid.layers.dynamic_decode( + self.decoder, + inits, + max_step_num=self.max_step_num, + output_time_major=self.output_time_major, + impute_finished=self.impute_finished, + is_test=self.is_test, + return_length=self.return_length, + **kwargs) - def forward(self, input_feature): - pre_gru, pre_state = self.gru(input_feature) - gru_r, r_state = self.gru_r(input_feature) - bi_merge = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1) - return bi_merge +class TransformerCell(Layer): + """ + Let inputs=(trg_word, trg_pos), 
states=cache to make Transformer can be + used as RNNCell + """ -class LinearChainCRF(Layer): - def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): - super(LinearChainCRF, self).__init__() + def __init__(self, decoder, embedding_fn=None, output_fn=None): + super(TransformerCell, self).__init__() + self.decoder = decoder + self.embedding_fn = embedding_fn + self.output_fn = output_fn - self._param_attr = param_attr - self._dtype = dtype - self._size = size - self._is_test = is_test - self._transition = self.create_parameter( - attr=self._param_attr, - shape=[self._size + 2, self._size], - dtype=self._dtype) + def forward(self, inputs, states, trg_src_attn_bias, enc_output, + static_caches): + trg_word, trg_pos = inputs + for cache, static_cache in zip(states, static_caches): + cache.update(static_cache) + if self.embedding_fn is not None: + dec_input = self.embedding_fn(trg_word, trg_pos) + outputs = self.decoder(dec_input, enc_output, None, + trg_src_attn_bias, states) + else: + outputs = self.decoder(trg_word, trg_pos, enc_output, None, + trg_src_attn_bias, states) + if self.output_fn is not None: + outputs = self.output_fn(outputs) + if len(outputs.shape) == 3: + # squeeze to adapt to BeamSearchDecoder which use 2D logits + outputs = layers.squeeze(outputs, [1]) + new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states] + return outputs, new_states @property - def weight(self): - return self._transition - - @weight.setter - def weight(self, value): - self._transition = value + def state_shape(self): + return [{ + "k": [self.decoder.n_head, 0, self.decoder.d_key], + "v": [self.decoder.n_head, 0, self.decoder.d_value], + } for i in range(len(self.decoder.n_layer))] - def forward(self, input, label, length=None): - alpha = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - emission_exps = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - transition_exps = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - log_likelihood = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - this_inputs = { - "Emission": [input], - "Transition": self._transition, - "Label": [label] - } - if length is not None: - this_inputs['Length'] = [length] - self._helper.append_op( - type='linear_chain_crf', - inputs=this_inputs, - outputs={ - "Alpha": [alpha], - "EmissionExps": [emission_exps], - "TransitionExps": transition_exps, - "LogLikelihood": log_likelihood - }, - attrs={"is_test": self._is_test, }) - return log_likelihood +class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): + def __init__(self, cell, start_token, end_token, beam_size, + var_dim_in_state): + super(TransformerBeamSearchDecoder, + self).__init__(cell, start_token, end_token, beam_size) + self.cell = cell + self.var_dim_in_state = var_dim_in_state + def _merge_batch_beams_with_var_dim(self, x): + # init length of cache is 0, and it increases with decoding carrying on, + # thus need to reshape elaborately + var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim + x = layers.transpose(x, + list(range(var_dim_in_state, len(x.shape))) + + list(range(0, var_dim_in_state))) + x = layers.reshape( + x, [0] * (len(x.shape) - var_dim_in_state + ) + [self.batch_size * self.beam_size] + + [int(size) for size in x.shape[-var_dim_in_state + 2:]]) + x = layers.transpose( + x, + list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) + + list(range(0, (len(x.shape) + 1 - var_dim_in_state)))) + return x -class 
CRFDecoding(Layer): - def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): - super(CRFDecoding, self).__init__() + def _split_batch_beams_with_var_dim(self, x): + var_dim_size = layers.shape(x)[self.var_dim_in_state] + x = layers.reshape( + x, [-1, self.beam_size] + + [int(size) + for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + + [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) + return x - self._dtype = dtype - self._size = size - self._is_test = is_test - self._param_attr = param_attr - self._transition = self.create_parameter( - attr=self._param_attr, - shape=[self._size + 2, self._size], - dtype=self._dtype) + def step(self, time, inputs, states, **kwargs): + # compared to RNN, Transformer has 3D data at every decoding step + inputs = layers.reshape(inputs, [-1, 1]) # token + pos = layers.ones_like(inputs) * time # pos + cell_states = map_structure(self._merge_batch_beams_with_var_dim, + states.cell_states) - @property - def weight(self): - return self._transition + cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, + **kwargs) + cell_outputs = map_structure(self._split_batch_beams, cell_outputs) + next_cell_states = map_structure(self._split_batch_beams_with_var_dim, + next_cell_states) - @weight.setter - def weight(self, value): - self._transition = value + beam_search_output, beam_search_state = self._beam_search_step( + time=time, + logits=cell_outputs, + next_cell_states=next_cell_states, + beam_state=states) + next_inputs, finished = (beam_search_output.predicted_ids, + beam_search_state.finished) - def forward(self, input, label=None, length=None): + return (beam_search_output, beam_search_state, next_inputs, finished) - viterbi_path = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - this_inputs = { - "Emission": [input], - "Transition": self._transition, - "Label": label - } - if length is not None: - this_inputs['Length'] = [length] - self._helper.append_op( - type='crf_decoding', - inputs=this_inputs, - outputs={"ViterbiPath": [viterbi_path]}, - attrs={"is_test": self._is_test, }) - return viterbi_path +### Transformer Modules ### +class PrePostProcessLayer(Layer): + """ + PrePostProcessLayer + """ -class GRUEncoder(Layer): def __init__(self, - input_dim, - grnn_hidden_dim, - init_bound, - num_layers=1, - is_bidirection=False): - super(GRUEncoder, self).__init__() - self.num_layers = num_layers - self.is_bidirection = is_bidirection - self.gru_list = [] - self.gru_r_list = [] - for i in range(num_layers): - self.basic_gru_cell = BasicGRUCell( - input_size=input_dim if i == 0 else input_dim * 2, - hidden_size=grnn_hidden_dim, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - self.gru_list.append( - self.add_sublayer( - "gru_%d" % i, - RNN(self.basic_gru_cell, - is_reverse=False, - time_major=False))) - if self.is_bidirection: - for i in range(num_layers): - self.basic_gru_cell_r = BasicGRUCell( - input_size=input_dim if i == 0 else input_dim * 2, - hidden_size=grnn_hidden_dim, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - self.gru_r_list.append( + process_cmd, + d_model, + dropout_rate, + reused_layer_norm=None): + super(PrePostProcessLayer, self).__init__() + self.process_cmd = 
process_cmd + self.functors = [] + for cmd in self.process_cmd: + if cmd == "a": # add residual connection + self.functors.append( + lambda x, y: x + y if y is not None else x) + elif cmd == "n": # add layer normalization + if reused_layer_norm is not None: + layer_norm = reused_layer_norm + else: + layer_norm = LayerNorm( + normalized_shape=d_model, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.))) + + self.functors.append( self.add_sublayer( - "gru_r_%d" % i, - RNN(self.basic_gru_cell_r, - is_reverse=True, - time_major=False))) + "layer_norm_%d" % len( + self.sublayers(include_sublayers=False)), + layer_norm)) + elif cmd == "d": # add dropout + self.functors.append(lambda x: layers.dropout( + x, dropout_prob=dropout_rate, is_test=False) + if dropout_rate else x) - def forward(self, input_feature, h0=None): - for i in range(self.num_layers): - pre_gru, pre_state = self.gru_list[i](input_feature) - if self.is_bidirection: - gru_r, r_state = self.gru_r_list[i](input_feature) - out = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1) + def forward(self, x, residual=None): + for i, cmd in enumerate(self.process_cmd): + if cmd == "a": + x = self.functors[i](x, residual) else: - out = pre_gru - input_feature = out - return out + x = self.functors[i](x) + return x -class SequenceTagging(Layer): - def __init__(self, - vocab_size, - num_labels, - word_emb_dim=128, - grnn_hidden_dim=128, - emb_learning_rate=0.1, - crf_learning_rate=0.1, - bigru_num=2, - init_bound=0.1): - super(SequenceTagging, self).__init__() - """ - define the sequence tagging network structure - word: stores the input of the model - for_infer: a boolean value, indicating if the model to be created is for training or predicting. 
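# A toy NumPy sketch of the `process_cmd` convention implemented by
# PrePostProcessLayer above: each character in the command string selects one
# step, "a" adds the residual, "n" applies layer normalization, "d" applies
# dropout. The normalization and dropout here are simplified stand-ins for the
# fluid layers, only meant to show how a command like "n" or "da" is read.
import numpy as np

def pre_post_process(x, residual=None, process_cmd="da", dropout_rate=0.1,
                     training=False):
    for cmd in process_cmd:
        if cmd == "a":                                   # residual connection
            x = x + residual if residual is not None else x
        elif cmd == "n":                                 # layer normalization
            mean = x.mean(axis=-1, keepdims=True)
            var = x.var(axis=-1, keepdims=True)
            x = (x - mean) / np.sqrt(var + 1e-5)
        elif cmd == "d" and dropout_rate and training:   # dropout (training only)
            mask = np.random.binomial(1, 1.0 - dropout_rate, size=x.shape)
            x = x * mask / (1.0 - dropout_rate)
    return x

x = np.random.rand(2, 4, 8)
y = pre_post_process(x, process_cmd="n")               # pre-process: normalize
y = pre_post_process(y, residual=x, process_cmd="da")  # post-process: dropout + add
print(y.shape)  # (2, 4, 8)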
+class MultiHeadAttention(Layer): + """ + Multi-Head Attention + """ - return: - for infer: return the prediction - otherwise: return the prediction - """ - self.word_emb_dim = word_emb_dim - self.vocab_size = vocab_size - self.num_labels = num_labels - self.grnn_hidden_dim = grnn_hidden_dim - self.emb_lr = emb_learning_rate - self.crf_lr = crf_learning_rate - self.bigru_num = bigru_num - self.init_bound = 0.1 + def __init__(self, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0.0, + reused_query_fc=None, + reused_key_fc=None, + reused_value_fc=None, + reused_proj_fc=None): - self.word_embedding = Embedding( - size=[self.vocab_size, self.word_emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr( - learning_rate=self.emb_lr, - name="word_emb", - initializer=fluid.initializer.Uniform( - low=-self.init_bound, high=self.init_bound))) + super(MultiHeadAttention, self).__init__() + self.n_head = n_head + self.d_key = d_key + self.d_value = d_value + self.d_model = d_model + self.dropout_rate = dropout_rate - self.gru_encoder = GRUEncoder( - input_dim=self.grnn_hidden_dim, - grnn_hidden_dim=self.grnn_hidden_dim, - init_bound=self.init_bound, - num_layers=self.bigru_num, - is_bidirection=True) + if reused_query_fc is not None: + self.q_fc = reused_query_fc + else: + self.q_fc = Linear( + input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) + if reused_key_fc is not None: + self.k_fc = reused_key_fc + else: + self.k_fc = Linear( + input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) + if reused_value_fc is not None: + self.v_fc = reused_value_fc + else: + self.v_fc = Linear( + input_dim=d_model, + output_dim=d_value * n_head, + bias_attr=False) + if reused_proj_fc is not None: + self.proj_fc = reused_proj_fc + else: + self.proj_fc = Linear( + input_dim=d_value * n_head, + output_dim=d_model, + bias_attr=False) - self.fc = Linear( - input_dim=self.grnn_hidden_dim * 2, - output_dim=self.num_labels, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-self.init_bound, high=self.init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) + def _prepare_qkv(self, queries, keys, values, cache=None): + if keys is None: # self-attention + keys, values = queries, queries + static_kv = False + else: # cross-attention + static_kv = True - self.linear_chain_crf = LinearChainCRF( - param_attr=fluid.ParamAttr( - name='linear_chain_crfw', learning_rate=self.crf_lr), - size=self.num_labels) + q = self.q_fc(queries) + q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) + q = layers.transpose(x=q, perm=[0, 2, 1, 3]) - self.crf_decoding = CRFDecoding( - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=self.crf_lr), - size=self.num_labels) + if cache is not None and static_kv and "static_k" in cache: + # for encoder-decoder attention in inference and has cached + k = cache["static_k"] + v = cache["static_v"] + else: + k = self.k_fc(keys) + v = self.v_fc(values) + k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) + k = layers.transpose(x=k, perm=[0, 2, 1, 3]) + v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) + v = layers.transpose(x=v, perm=[0, 2, 1, 3]) - def forward(self, word, lengths, target=None): - """ - Configure the network - """ - word_embed = self.word_embedding(word) - input_feature = word_embed + if cache is not None: + if static_kv and not "static_k" in cache: + # for encoder-decoder attention in inference and has not cached + cache["static_k"], cache["static_v"] 
= k, v + elif not static_kv: + # for decoder self-attention in inference + cache_k, cache_v = cache["k"], cache["v"] + k = layers.concat([cache_k, k], axis=2) + v = layers.concat([cache_v, v], axis=2) + cache["k"], cache["v"] = k, v - bigru_output = self.gru_encoder(input_feature) - emission = self.fc(bigru_output) + return q, k, v - if target is not None: - crf_cost = self.linear_chain_crf( - input=emission, label=target, length=lengths) - avg_cost = fluid.layers.mean(x=crf_cost) - self.crf_decoding.weight = self.linear_chain_crf.weight - crf_decode = self.crf_decoding(input=emission, length=lengths) - return crf_decode, avg_cost, lengths - else: - self.linear_chain_crf.weight = self.crf_decoding.weight - crf_decode = self.crf_decoding(input=emission, length=lengths) - return crf_decode, lengths + def forward(self, queries, keys, values, attn_bias, cache=None): + # compute q ,k ,v + q, k, v = self._prepare_qkv(queries, keys, values, cache) + # scale dot product attention + product = layers.matmul( + x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5) + if attn_bias is not None: + product += attn_bias + weights = layers.softmax(product) + if self.dropout_rate: + weights = layers.dropout( + weights, dropout_prob=self.dropout_rate, is_test=False) -class StackedRNNCell(RNNCell): - def __init__(self, cells): - self.cells = [] - for i, cell in enumerate(cells): - self.cells.append(self.add_sublayer("cell_%d" % i, cell)) + out = layers.matmul(weights, v) - def forward(self, inputs, states): - pass + # combine heads + out = layers.transpose(out, perm=[0, 2, 1, 3]) + out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) - @staticmethod - def stack_param_attr(param_attr, n): - if isinstance(param_attr, (list, tuple)): - assert len(param_attr) == n, ( - "length of param_attr should be %d when it is a list/tuple" % - n) - param_attrs = [ - fluid.ParamAttr._to_attr(attr) for attr in param_attr - ] - else: - param_attrs = [] - attr = fluid.ParamAttr._to_attr(param_attr) - for i in range(n): - attr_i = copy.deepcopy(attr) - if attr.name: - attr_i.name = attr_i.name + "_" + str(i) - param_attrs.append(attr_i) - return param_attrs + # project to output + out = self.proj_fc(out) + return out - @property - def state_shape(self): - return [cell.state_shape for cell in self.cells] + def cal_kv(self, keys, values): + k = self.k_fc(keys) + v = self.v_fc(values) + k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) + k = layers.transpose(x=k, perm=[0, 2, 1, 3]) + v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) + v = layers.transpose(x=v, perm=[0, 2, 1, 3]) + return k, v -class StackedLSTMCell(RNNCell): +class FFN(Layer): """ + Feed-Forward Network """ def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, - num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedLSTMCell, self).__init__() - self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", - float) - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + d_inner_hid, + d_model, + dropout_rate, + fc1_act="relu", + reused_fc1=None, + reused_fc2=None): + super(FFN, self).__init__() + self.dropout_rate = dropout_rate + if reused_fc1 is not None: + self.fc1 = reused_fc1 + else: + self.fc1 = Linear( + input_dim=d_model, output_dim=d_inner_hid, act=fc1_act) + if reused_fc2 is not None: + self.fc2 = reused_fc2 + else: + 
self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) - self.cells = [] - for i in range(num_layers): - if forget_bias is True: - bias_attrs[ - i].initializer = fluid.initializer.NumpyArrayInitializer( - np.concatenate( - np.zeros(2 * hidden_size), - np.ones(hidden_size), np.zeros(hidden_size)) - .astype(dtype)) - forget_bias = 0.0 - self.cells.append( - self.add_sublayer( - "lstm_%d" % i, - BasicLSTMCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - gate_activation=gate_activation, - activation=activation, - forget_bias=forget_bias, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) + def forward(self, x): + hidden = self.fc1(x) + if self.dropout_rate: + hidden = layers.dropout( + hidden, dropout_prob=self.dropout_rate, is_test=False) + out = self.fc2(hidden) + return out - def forward(self, step_input, states): - new_states = [] - for i, cell in enumerate(self.cells): - out, new_state = cell(step_input, states[i]) - step_input = layers.dropout( - out, - self.dropout[i], - dropout_implementation='upscale_in_train') if self.dropout[ - i] > 0 else out - new_states.append(new_state) + +class TransformerEncoderLayer(Layer): + """ + EncoderLayer + """ + + def __init__(self, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da", + ffn_fc1_act="relu", + reused_pre_selatt_layernorm=None, + reused_multihead_att_weights={ + "reused_query_fc": None, + "reused_key_fc": None, + "reused_value_fc": None, + "reused_proj_fc": None + }, + reused_post_selfatt_layernorm=None, + reused_pre_ffn_layernorm=None, + reused_ffn_weights={"reused_fc1": None, + "reused_fc2": None}, + reused_post_ffn_layernorm=None): + + super(TransformerEncoderLayer, self).__init__() + + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout, + reused_pre_selatt_layernorm) + self.self_attn = MultiHeadAttention( + d_key, + d_value, + d_model, + n_head, + attention_dropout, + reused_query_fc=reused_multihead_att_weights["reused_query_fc"], + reused_key_fc=reused_multihead_att_weights["reused_key_fc"], + reused_value_fc=reused_multihead_att_weights["reused_value_fc"], + reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"]) + self.postprocesser1 = PrePostProcessLayer( + postprocess_cmd, d_model, prepostprocess_dropout, + reused_post_selfatt_layernorm) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout, + reused_pre_ffn_layernorm) + self.ffn = FFN(d_inner_hid, + d_model, + relu_dropout, + fc1_act=ffn_fc1_act, + reused_fc1=reused_ffn_weights["reused_fc1"], + reused_fc2=reused_ffn_weights["reused_fc2"]) + self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout, + reused_post_ffn_layernorm) + + def forward(self, enc_input, attn_bias): + attn_output = self.self_attn( + self.preprocesser1(enc_input), None, None, attn_bias) + attn_output = self.postprocesser1(attn_output, enc_input) + + ffn_output = self.ffn(self.preprocesser2(attn_output)) + ffn_output = self.postprocesser2(ffn_output, attn_output) + return ffn_output + + +class TransformerEncoder(Layer): + """ + encoder + """ + + def __init__(self, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da", + ffn_fc1_act="relu"): + + super(TransformerEncoder, self).__init__() + + 
self.encoder_layers = list() + for i in range(n_layer): + self.encoder_layers.append( + self.add_sublayer( + "layer_%d" % i, + TransformerEncoderLayer( + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + ffn_fc1_act=ffn_fc1_act))) + self.processer = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, enc_input, attn_bias): + for encoder_layer in self.encoder_layers: + enc_output = encoder_layer(enc_input, attn_bias) + enc_input = enc_output + + return self.processer(enc_output) + + +class TransformerDecoderLayer(Layer): + """ + decoder + """ + + def __init__(self, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da", + reused_pre_selfatt_layernorm=None, + reused_self_multihead_att_weights={ + "reused_query_fc": None, + "reused_key_fc": None, + "reused_value_fc": None, + "reused_proj_fc": None + }, + reused_post_selfatt_layernorm=None, + reused_pre_crossatt_layernorm=None, + reused_cross_multihead_att_weights={ + "reused_query_fc": None, + "reused_key_fc": None, + "reused_value_fc": None, + "reused_proj_fc": None + }, + reused_post_crossatt_layernorm=None, + reused_pre_ffn_layernorm=None, + reused_ffn_weights={"reused_fc1": None, + "reused_fc2": None}, + reused_post_ffn_layernorm=None): + super(TransformerDecoderLayer, self).__init__() + + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout, + reused_pre_selfatt_layernorm) + self.self_attn = MultiHeadAttention( + d_key, + d_value, + d_model, + n_head, + attention_dropout, + reused_query_fc=reused_self_multihead_att_weights[ + "reused_query_fc"], + reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"], + reused_value_fc=reused_self_multihead_att_weights[ + "reused_value_fc"], + reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"]) + self.postprocesser1 = PrePostProcessLayer( + postprocess_cmd, d_model, prepostprocess_dropout, + reused_post_selfatt_layernorm) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout, + reused_pre_crossatt_layernorm) + self.cross_attn = MultiHeadAttention( + d_key, + d_value, + d_model, + n_head, + attention_dropout, + reused_query_fc=reused_cross_multihead_att_weights[ + "reused_query_fc"], + reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"], + reused_value_fc=reused_cross_multihead_att_weights[ + "reused_value_fc"], + reused_proj_fc=reused_cross_multihead_att_weights[ + "reused_proj_fc"]) + self.postprocesser2 = PrePostProcessLayer( + postprocess_cmd, d_model, prepostprocess_dropout, + reused_post_crossatt_layernorm) + + self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout, + reused_pre_ffn_layernorm) + self.ffn = FFN(d_inner_hid, + d_model, + relu_dropout, + reused_fc1=reused_ffn_weights["reused_fc1"], + reused_fc2=reused_ffn_weights["reused_fc2"]) + self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout, + reused_post_ffn_layernorm) + + def forward(self, + dec_input, + enc_output, + self_attn_bias, + cross_attn_bias, + cache=None): + self_attn_output = self.self_attn( + self.preprocesser1(dec_input), None, None, self_attn_bias, cache) + self_attn_output = self.postprocesser1(self_attn_output, dec_input) + + cross_attn_output = self.cross_attn( + 
self.preprocesser2(self_attn_output), enc_output, enc_output, + cross_attn_bias, cache) + cross_attn_output = self.postprocesser2(cross_attn_output, + self_attn_output) + + ffn_output = self.ffn(self.preprocesser3(cross_attn_output)) + ffn_output = self.postprocesser3(ffn_output, cross_attn_output) + + return ffn_output + + +class TransformerDecoder(Layer): + """ + decoder + """ + + def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd): + super(TransformerDecoder, self).__init__() + + self.n_layer = n_layer + self.n_head = n_head + self.d_key = d_key + self.d_value = d_value + + self.decoder_layers = list() + for i in range(n_layer): + self.decoder_layers.append( + self.add_sublayer( + "layer_%d" % i, + TransformerDecoderLayer( + n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, postprocess_cmd))) + self.processer = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, + dec_input, + enc_output, + self_attn_bias, + cross_attn_bias, + caches=None): + for i, decoder_layer in enumerate(self.decoder_layers): + dec_output = decoder_layer(dec_input, enc_output, self_attn_bias, + cross_attn_bias, None + if caches is None else caches[i]) + dec_input = dec_output + + return self.processer(dec_output) + + def prepare_static_cache(self, enc_output): + return [ + dict( + zip(("static_k", "static_v"), + decoder_layer.cross_attn.cal_kv(enc_output, enc_output))) + for decoder_layer in self.decoder_layers + ] + + def prepare_incremental_cache(self, enc_output): + return [{ + "k": layers.fill_constant_batch_size_like( + input=enc_output, + shape=[-1, self.n_head, 0, self.d_key], + dtype=enc_output.dtype, + value=0), + "v": layers.fill_constant_batch_size_like( + input=enc_output, + shape=[-1, self.n_head, 0, self.d_value], + dtype=enc_output.dtype, + value=0), + } for i in range(self.n_layer)] + + +#TODO: we should merge GRUCell with BasicGRUCell +class GRUCell(RNNCell): + def __init__(self, + input_size, + hidden_size, + param_attr=None, + bias_attr=None, + gate_activation='sigmoid', + candidate_activation='tanh', + origin_mode=False): + super(GRUCell, self).__init__() + self.hidden_size = hidden_size + self.fc_layer = Linear( + input_size, hidden_size * 3, param_attr=param_attr) + + self.gru_unit = GRUUnit( + hidden_size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + + def forward(self, inputs, states): + # for GRUCell, `step_outputs` and `new_states` both are hidden + x = self.fc_layer(inputs) + hidden, _, _ = self.gru_unit(x, states) + return hidden, hidden + + @property + def state_shape(self): + return [self.hidden_size] + + +#TODO: we should merge GRUCell with BasicGRUCell +class GRUEncoderCell(RNNCell): + def __init__(self, + num_layers, + input_size, + hidden_size, + dropout_prob=0., + init_scale=0.1): + super(GRUEncoderCell, self).__init__() + self.dropout_prob = dropout_prob + # use add_sublayer to add multi-layers + self.gru_cells = [] + for i in range(num_layers): + self.gru_cells.append( + self.add_sublayer( + "gru_%d" % i, + #BasicGRUCell( + GRUCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-init_scale, high=init_scale))))) + + def 
forward(self, step_input, states): + new_states = [] + for i, gru_cell in enumerate(self.gru_cells): + out, state = gru_cell(step_input, states[i]) + step_input = layers.dropout( + out, + self.dropout_prob, + dropout_implementation='upscale_in_train' + ) if self.dropout_prob > 0 else out + new_states.append(step_input) return step_input, new_states @property - def state_shape(self): - return [cell.state_shape for cell in self.cells] + def state_shape(self): + return [cell.state_shape for cell in self.gru_cells] + + +class BiGRU(fluid.dygraph.Layer): + def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): + super(BiGRU, self).__init__() + self.gru = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0, + init_bound), + is_reverse=False, + time_major=False) + + self.gru_r = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0, + init_bound), + is_reverse=True, + time_major=False) + + def forward(self, input_feature): + pre_gru, pre_state = self.gru(input_feature) + gru_r, r_state = self.gru_r(input_feature) + bi_merge = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1) + return bi_merge + + +class LinearChainCRF(Layer): + def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): + super(LinearChainCRF, self).__init__() + + self._param_attr = param_attr + self._dtype = dtype + self._size = size + self._is_test = is_test + self._transition = self.create_parameter( + attr=self._param_attr, + shape=[self._size + 2, self._size], + dtype=self._dtype) + + @property + def weight(self): + return self._transition + @weight.setter + def weight(self, value): + self._transition = value -class LSTM(Layer): - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, - num_layers=1, - dropout=0.0, - is_reverse=False, - time_major=False, - param_attr=None, - bias_attr=None, - dtype='float32'): - super(LSTM, self).__init__() - lstm_cell = StackedLSTMCell(input_size, hidden_size, gate_activation, - activation, forget_bias, num_layers, - dropout, param_attr, bias_attr, dtype) - self.lstm = RNN(lstm_cell, is_reverse, time_major) + def forward(self, input, label, length=None): - def forward(self, inputs, initial_states=None, sequence_length=None): - return self.lstm(inputs, initial_states, sequence_length) + alpha = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + emission_exps = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + transition_exps = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + log_likelihood = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + this_inputs = { + "Emission": [input], + "Transition": self._transition, + "Label": [label] + } + if length is not None: + this_inputs['Length'] = [length] + self._helper.append_op( + type='linear_chain_crf', + inputs=this_inputs, + outputs={ + "Alpha": [alpha], + "EmissionExps": [emission_exps], + "TransitionExps": transition_exps, + "LogLikelihood": log_likelihood + }, + attrs={"is_test": self._is_test, }) + return log_likelihood -class BidirectionalRNN(Layer): - def __init__(self, - cell_fw, - cell_bw, - merge_mode='concat', - time_major=False, - cell_cls=None, - **kwargs): - super(BidirectionalRNN, self).__init__() - self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) - self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) - if merge_mode == 'concat': - self.merge_func = lambda x, y: layers.concat([x, y], -1) - elif merge_mode == 'sum': - 
self.merge_func = lambda x, y: layers.elementwise_add(x, y) - elif merge_mode == 'ave': - self.merge_func = lambda x, y: layers.scale( - layers.elementwise_add(x, y), 0.5) - elif merge_mode == 'mul': - self.merge_func = lambda x, y: layers.elementwise_mul(x, y) - elif merge_mode == 'zip': - self.merge_func = lambda x, y: (x, y) - elif merge_mode is None: - self.merge_func = None - else: - raise ValueError('Unsupported value for `merge_mode`: %s' % - merge_mode) +class CRFDecoding(Layer): + def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): + super(CRFDecoding, self).__init__() - def forward(self, inputs, initial_states=None, sequence_length=None): - if isinstance(initial_states, (list, tuple)): - assert len( - initial_states - ) == 2, "length of initial_states should be 2 when it is a list/tuple" - else: - initial_states = [initial_states, initial_states] - outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], - sequence_length) - outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], - sequence_length) - outputs = map_structure( - self.merge_func, outputs_fw, - outputs_bw) if self.merge_func else (outputs_fw, outputs_bw) - return outputs, (states_fw, states_bw) + self._dtype = dtype + self._size = size + self._is_test = is_test + self._param_attr = param_attr + self._transition = self.create_parameter( + attr=self._param_attr, + shape=[self._size + 2, self._size], + dtype=self._dtype) - @staticmethod - def bidirect_param_attr(param_attr): - if isinstance(param_attr, (list, tuple)): - assert len( - param_attr - ) == 2, "length of param_attr should be 2 when it is a list/tuple" - param_attrs = param_attr - else: - param_attrs = [] - attr = fluid.ParamAttr._to_attr(param_attr) - attr_fw = copy.deepcopy(attr) - if attr.name: - attr_fw.name = attr_fw.name + "_fw" - param_attrs.append(attr_fw) - attr_bw = copy.deepcopy(attr) - if attr.name: - attr_bw.name = attr_bw.name + "_bw" - param_attrs.append(attr_bw) - return param_attrs + @property + def weight(self): + return self._transition + + @weight.setter + def weight(self, value): + self._transition = value + def forward(self, input, label=None, length=None): -class BidirectionalLSTM(Layer): + viterbi_path = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + this_inputs = { + "Emission": [input], + "Transition": self._transition, + "Label": label + } + if length is not None: + this_inputs['Length'] = [length] + self._helper.append_op( + type='crf_decoding', + inputs=this_inputs, + outputs={"ViterbiPath": [viterbi_path]}, + attrs={"is_test": self._is_test, }) + return viterbi_path + + +class GRUEncoder(Layer): def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, + input_dim, + grnn_hidden_dim, + init_bound, num_layers=1, - dropout=0.0, - merge_mode='concat', - merge_each_layer=False, - time_major=False, - param_attr=None, - bias_attr=None, - dtype='float32'): - super(BidirectionalLSTM, self).__init__() + is_bidirection=False): + super(GRUEncoder, self).__init__() self.num_layers = num_layers - self.merge_mode = merge_mode - self.merge_each_layer = merge_each_layer - param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) - bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) - if not merge_each_layer: - cell_fw = StackedLSTMCell(input_size, hidden_size, gate_activation, - activation, forget_bias, num_layers, - dropout, param_attrs[0], bias_attrs[0], - dtype) - cell_bw = StackedLSTMCell(input_size, hidden_size, 
gate_activation, - activation, forget_bias, num_layers, - dropout, param_attrs[1], bias_attrs[1], - dtype) - self.lstm = BidirectionalRNN( - cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major) - else: - fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], - num_layers) - bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], - num_layers) - fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], - num_layers) - bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], - num_layers) - - # maybe design cell including both forward and backward later - self.lstm = [] + self.is_bidirection = is_bidirection + self.gru_list = [] + self.gru_r_list = [] + for i in range(num_layers): + self.basic_gru_cell = BasicGRUCell( + input_size=input_dim if i == 0 else input_dim * 2, + hidden_size=grnn_hidden_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-init_bound, high=init_bound), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4))) + self.gru_list.append( + self.add_sublayer( + "gru_%d" % i, + RNN(self.basic_gru_cell, + is_reverse=False, + time_major=False))) + if self.is_bidirection: for i in range(num_layers): - cell_fw = StackedLSTMCell( - input_size if i == 0 else (hidden_size * 2 - if merge_mode == 'concat' else - hidden_size), hidden_size, - gate_activation, activation, forget_bias, 1, dropout, - fw_param_attrs[i], fw_bias_attrs[i], dtype) - cell_bw = StackedLSTMCell( - input_size if i == 0 else (hidden_size * 2 - if merge_mode == 'concat' else - hidden_size), hidden_size, - gate_activation, activation, forget_bias, 1, dropout, - bw_param_attrs[i], bw_bias_attrs[i], dtype) - self.lstm.append( + self.basic_gru_cell_r = BasicGRUCell( + input_size=input_dim if i == 0 else input_dim * 2, + hidden_size=grnn_hidden_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-init_bound, high=init_bound), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4))) + self.gru_r_list.append( self.add_sublayer( - "lstm_%d" % i, - BidirectionalRNN( - cell_fw, - cell_bw, - merge_mode=merge_mode, - time_major=time_major))) + "gru_r_%d" % i, + RNN(self.basic_gru_cell_r, + is_reverse=True, + time_major=False))) - def forward(self, inputs, initial_states=None, sequence_length=None): - if not self.merge_each_layer: - return self.lstm(inputs, initial_states, sequence_length) - else: - if isinstance(initial_states, (list, tuple)): - assert len(initial_states) == self.num_layers, ( - "length of initial_states should be %d when it is a list/tuple" - % self.num_layers) + def forward(self, input_feature, h0=None): + for i in range(self.num_layers): + pre_gru, pre_state = self.gru_list[i](input_feature) + if self.is_bidirection: + gru_r, r_state = self.gru_r_list[i](input_feature) + out = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1) else: - initial_states = [initial_states] * self.num_layers - stacked_states = [] - for i in range(self.num_layers): - outputs, states = self.lstm[i](inputs, initial_states[i], - sequence_length) - inputs = outputs - stacked_states.append(states) - return outputs, stacked_states - + out = pre_gru + input_feature = out + return out -class StackedGRUCell(RNNCell): - """ - """ +class SequenceTagging(Layer): def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedGRUCell, 
self).__init__() - self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", - float) - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - - self.cells = [] - for i in range(num_layers): - self.cells.append( - self.add_sublayer( - "gru_%d" % i, - BasicGRUCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - gate_activation=gate_activation, - activation=activation, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) + vocab_size, + num_labels, + word_emb_dim=128, + grnn_hidden_dim=128, + emb_learning_rate=0.1, + crf_learning_rate=0.1, + bigru_num=2, + init_bound=0.1): + super(SequenceTagging, self).__init__() + """ + define the sequence tagging network structure + word: stores the input of the model + for_infer: a boolean value, indicating if the model to be created is for training or predicting. - def forward(self, step_input, states): - new_states = [] - for i, cell in enumerate(self.cells): - out, new_state = cell(step_input, states[i]) - step_input = layers.dropout( - out, - self.dropout[i], - dropout_implementation='upscale_in_train') if self.dropout[ - i] > 0 else out - new_states.append(new_state) - return step_input, new_states + return: + for infer: return the prediction + otherwise: return the prediction + """ + self.word_emb_dim = word_emb_dim + self.vocab_size = vocab_size + self.num_labels = num_labels + self.grnn_hidden_dim = grnn_hidden_dim + self.emb_lr = emb_learning_rate + self.crf_lr = crf_learning_rate + self.bigru_num = bigru_num + self.init_bound = 0.1 - @property - def state_shape(self): - return [cell.state_shape for cell in self.cells] + self.word_embedding = Embedding( + size=[self.vocab_size, self.word_emb_dim], + dtype='float32', + param_attr=fluid.ParamAttr( + learning_rate=self.emb_lr, + name="word_emb", + initializer=fluid.initializer.Uniform( + low=-self.init_bound, high=self.init_bound))) + self.gru_encoder = GRUEncoder( + input_dim=self.grnn_hidden_dim, + grnn_hidden_dim=self.grnn_hidden_dim, + init_bound=self.init_bound, + num_layers=self.bigru_num, + is_bidirection=True) -class GRU(Layer): - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - num_layers=1, - dropout=0.0, - is_reverse=False, - time_major=False, - param_attr=None, - bias_attr=None, - dtype='float32'): - super(GRU, self).__init__() - gru_cell = StackedGRUCell(input_size, hidden_size, gate_activation, - activation, num_layers, dropout, param_attr, - bias_attr, dtype) - self.gru = RNN(gru_cell, is_reverse, time_major) + self.fc = Linear( + input_dim=self.grnn_hidden_dim * 2, + output_dim=self.num_labels, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-self.init_bound, high=self.init_bound), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4))) - def forward(self, inputs, initial_states=None, sequence_length=None): - return self.gru(inputs, initial_states, sequence_length) + self.linear_chain_crf = LinearChainCRF( + param_attr=fluid.ParamAttr( + name='linear_chain_crfw', learning_rate=self.crf_lr), + size=self.num_labels) + self.crf_decoding = CRFDecoding( + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=self.crf_lr), + size=self.num_labels) -class BidirectionalGRU(Layer): - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, - num_layers=1, - dropout=0.0, - 
merge_mode='concat', - merge_each_layer=False, - time_major=False, - param_attr=None, - bias_attr=None, - dtype='float32'): - super(BidirectionalGRU, self).__init__() - self.num_layers = num_layers - self.merge_mode = merge_mode - self.merge_each_layer = merge_each_layer - param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) - bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) - if not merge_each_layer: - cell_fw = StackedGRUCell(input_size, hidden_size, gate_activation, - activation, num_layers, dropout, - param_attrs[0], bias_attrs[0], dtype) - cell_bw = StackedGRUCell(input_size, hidden_size, gate_activation, - activation, num_layers, dropout, - param_attrs[1], bias_attrs[1], dtype) - self.gru = BidirectionalRNN( - cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major) - else: - fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], - num_layers) - bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], - num_layers) - fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], - num_layers) - bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], - num_layers) + def forward(self, word, lengths, target=None): + """ + Configure the network + """ + word_embed = self.word_embedding(word) + input_feature = word_embed - # maybe design cell including both forward and backward later - self.gru = [] - for i in range(num_layers): - cell_fw = StackedGRUCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, gate_activation, activation, 1, - dropout, fw_param_attrs[i], - fw_bias_attrs[i], dtype) - cell_bw = StackedGRUCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, gate_activation, activation, 1, - dropout, bw_param_attrs[i], - bw_bias_attrs[i], dtype) - self.gru.append( - self.add_sublayer( - "gru_%d" % i, - BidirectionalRNN( - cell_fw, - cell_bw, - merge_mode=merge_mode, - time_major=time_major))) + bigru_output = self.gru_encoder(input_feature) + emission = self.fc(bigru_output) - def forward(self, inputs, initial_states=None, sequence_length=None): - if not self.merge_each_layer: - return self.gru(inputs, initial_states, sequence_length) + if target is not None: + crf_cost = self.linear_chain_crf( + input=emission, label=target, length=lengths) + avg_cost = fluid.layers.mean(x=crf_cost) + self.crf_decoding.weight = self.linear_chain_crf.weight + crf_decode = self.crf_decoding(input=emission, length=lengths) + return crf_decode, avg_cost, lengths else: - if isinstance(initial_states, (list, tuple)): - assert len(initial_states) == self.num_layers, ( - "length of initial_states should be %d when it is a list/tuple" - % self.num_layers) - else: - initial_states = [initial_states] * self.num_layers - stacked_states = [] - for i in range(self.num_layers): - outputs, states = self.gru[i](inputs, initial_states[i], - sequence_length) - inputs = outputs - stacked_states.append(states) - return outputs, stacked_states + self.linear_chain_crf.weight = self.crf_decoding.weight + crf_decode = self.crf_decoding(input=emission, length=lengths) + return crf_decode, lengths From cf752eba2840edd5ccbf327bbe8506b4123116f2 Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 11 May 2020 02:26:54 +0800 Subject: [PATCH 05/16] Add api docs for TransformerCell and TransformerBeamSearchDecoder. 
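A minimal dygraph sketch of the inference flow these docs describe. The layer sizes, token ids and `max_step_num` value below are illustrative assumptions rather than values fixed by this patch; constructor signatures and the cache helpers follow the current text.py:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.dygraph import Embedding, Linear, to_variable
    from hapi.text import (TransformerDecoder, TransformerCell,
                           TransformerBeamSearchDecoder, DynamicDecode)

    with fluid.dygraph.guard():
        word_emb = Embedding(size=[1000, 128])
        pos_emb = Embedding(size=[50, 128])

        def embedding_fn(word, pos):
            # sum of word and position embeddings as decoder input
            return word_emb(word) + pos_emb(pos)

        output_fn = Linear(128, 1000)
        decoder = TransformerDecoder(2, 2, 64, 64, 128, 512,
                                     0.1, 0.1, 0.1, "n", "da")
        cell = TransformerCell(decoder, embedding_fn, output_fn)
        # start_token=0, end_token=1, beam_size=4; caches grow on dim 2
        beam_search_decoder = DynamicDecode(
            TransformerBeamSearchDecoder(
                cell, 0, 1, 4, var_dim_in_state=2),
            max_step_num=10,
            is_test=True)

        # encoder output: [batch_size, src_len, d_model]
        enc_output = to_variable(
            np.random.rand(2, 5, 128).astype("float32"))
        # cross attention bias: [batch_size, n_head, trg_len, src_len]
        trg_src_attn_bias = to_variable(
            np.zeros([2, 2, 1, 5], dtype="float32"))
        # per-layer incremental caches, built from the un-tiled encoder output
        caches = decoder.prepare_incremental_cache(enc_output)
        enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
            enc_output, 4)
        trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
            trg_src_attn_bias, 4)
        static_caches = decoder.prepare_static_cache(enc_output)
        outputs = beam_search_decoder(
            inits=caches,
            enc_output=enc_output,
            trg_src_attn_bias=trg_src_attn_bias,
            static_caches=static_caches)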
--- hapi/text/text.py | 322 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 304 insertions(+), 18 deletions(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index 5327bbd..b5a849d 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -1856,7 +1856,7 @@ class GRU(Layer): Parameters: - input_size (int): The input size for the first GRU cell. + input_size (int): The input feature size for the first GRU cell. hidden_size (int): The hidden size for every GRU cell. gate_activation (function, optional): The activation function for gates of GRU, that is :math:`act_g` in the formula. Default: None, @@ -1971,7 +1971,7 @@ class BidirectionalGRU(Layer): Parameters: - input_size (int): The input size for the first GRU cell. + input_size (int): The input feature size for the first GRU cell. hidden_size (int): The hidden size for every GRU cell. gate_activation (function, optional): The activation function for gates of GRU, that is :math:`act_g` in the formula. Default: None, @@ -2346,8 +2346,59 @@ def _maybe_copy(state, new_state, step_mask): class TransformerCell(Layer): """ - Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be - used as RNNCell + TransformerCell wraps a Transformer decoder producing logits from `inputs` + composed by ids and position. + + Parameters: + decoder(callable): A TransformerDecoder instance. Or a wrapper of it that + includes a embedding layer accepting ids and positions instead of embeddings + and includes a output layer transforming decoder output features to logits. + embedding_fn(function, optional): A callable that accepts ids and position + as arguments and return embeddings as input of `decoder`. It can be + None if `decoder` includes a embedding layer. Default None. + output_fn(callable, optional): A callable applid on `decoder` output to + transform decoder output features to get logits. Mostly it is a Linear + layer with vocabulary size. It can be None if `decoder` includes a + output layer. Default None. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import TransformerCell + from paddle.incubate.hapi.text import TransformerBeamSearchDecoder + + embedder = Embedding(size=[1000, 128]) + output_layer = Linear(128, 1000) + decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) + transformer_cell = TransformerCell(decoder, embedder, output_layer) + dynamic_decoder = DynamicDecode( + TransformerBeamSearchDecoder( + transformer_cell, + bos_id=0, + eos_id=1, + beam_size=4, + var_dim_in_state=2), + max_step_num, + is_test=True) + + enc_output = paddle.rand((2, 4, 64)) + # cross attention bias: [batch_size, n_head, trg_len, src_len] + trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) + # inputs for beam search on Transformer + states = cell.get_initial_states(encoder_output) + enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + enc_output, beam_size=4) + trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + trg_src_attn_bias, self.beam_size) + static_caches = decoder.prepare_static_cache(enc_output) + outputs = dynamic_decoder( + inits=caches, + enc_output=enc_output, + trg_src_attn_bias=trg_src_attn_bias, + static_caches=static_caches) """ def __init__(self, decoder, embedding_fn=None, output_fn=None): @@ -2356,11 +2407,56 @@ def __init__(self, decoder, embedding_fn=None, output_fn=None): self.embedding_fn = embedding_fn self.output_fn = output_fn - def forward(self, inputs, states, trg_src_attn_bias, enc_output, - static_caches): + def forward(self, + inputs, + states=None, + enc_output=None, + trg_slf_attn_bias=None, + trg_src_attn_bias=None, + static_caches=[]): + """ + Produces logits from `inputs` composed by ids and positions. + + Parameters: + inputs(tuple): A tuple includes target ids and positions. The two + tensors both have int64 data type and with 2D shape + `[batch_size, sequence_length]` where `sequence_length` is 1 + for inference. + states(list): It caches the multi-head attention intermediate results + of history decoding steps. It is a list of dict where the length + of list is decoder layer number, and each dict has `k` and `v` as + keys and values are cached results. Default None + enc_output(Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, sequence_length, d_model]`. The data type + should be float32 or float64. + trg_slf_attn_bias(Variable, optional): A tensor used in decoder self + attention to mask out attention on unwanted target positions. It + is a tensor with shape `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. It can be None for inference. The data type should + be float32 or float64. + trg_src_attn_bias(Variable, optional): A tensor used in decoder encoder + cross attention to mask out unwanted attention on source (encoder output). + It is a tensor with shape `[batch_size, n_head, target_length, source_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. + static_caches(list): It stores the multi-head attention intermediate + results of encoder output. It is a list of dict where the length + of list is decoder layer number, and each dict has `static_k` and + `static_v` as keys and values are stored results. 
Default empty list + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \ + is a float32 or float64 3D tensor representing logits shaped \ + `[batch_size, sequence_length, vocab_size]`. `new_states has \ + the same structure and date type with `states` while the length \ + is one larger since the intermediate results of current step are \ + concatenated into it. + """ trg_word, trg_pos = inputs - for cache, static_cache in zip(states, static_caches): - cache.update(static_cache) + if states and static_caches: + for cache, static_cache in zip(states, static_caches): + cache.update(static_cache) if self.embedding_fn is not None: dec_input = self.embedding_fn(trg_word, trg_pos) outputs = self.decoder(dec_input, enc_output, None, @@ -2370,14 +2466,30 @@ def forward(self, inputs, states, trg_src_attn_bias, enc_output, trg_src_attn_bias, states) if self.output_fn is not None: outputs = self.output_fn(outputs) - if len(outputs.shape) == 3: - # squeeze to adapt to BeamSearchDecoder which use 2D logits - outputs = layers.squeeze(outputs, [1]) - new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states] + + new_states = [{ + "k": cache["k"], + "v": cache["v"] + } for cache in states] if states else states return outputs, new_states @property def state_shape(self): + """ + States of TransformerCell cache the multi-head attention intermediate + results of history decoding steps, and have a increasing length as + decoding continued. + + `state_shape` of TransformerCell is used to initialize states. It is a + list of dict where the length of list is decoder layer, and each dict + has `k` and `v` as keys and values are `[n_head, 0, d_key]`, `[n_head, 0, d_value]` + separately. (-1 for batch size would be automatically inserted into shape). + + Returns: + list: It is a list of dict where the length of list is decoder layer \ + number, and each dict has `k` and `v` as keys and values are cached \ + results. + """ return [{ "k": [self.decoder.n_head, 0, self.decoder.d_key], "v": [self.decoder.n_head, 0, self.decoder.d_value], @@ -2385,6 +2497,60 @@ def state_shape(self): class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): + """ + Compared with a RNN step :code:`outputs, new_states = cell(inputs, states)`, + Transformer decoder's `inputs` uses 2D tensor shaped `[batch_size * beam_size, 1]` + and includes extra position data. And its `states` (caches) has increasing + length. These are not consistent with `BeamSearchDecoder`, thus subclass + `BeamSearchDecoder` to make beam search adapt to Transformer decoder. + + Parameters: + cell(TransformerCell): An instance of `TransformerCell`. + start_token(int): The start token id. + end_token(int): The end token id. + beam_size(int): The beam width used in beam search. + var_dim_in_state(int): Indicate which dimension of states is variant. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import TransformerCell + from paddle.incubate.hapi.text import TransformerBeamSearchDecoder + + embedder = Embedding(size=[1000, 128]) + output_layer = Linear(128, 1000) + decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) + transformer_cell = TransformerCell(decoder, embedder, output_layer) + dynamic_decoder = DynamicDecode( + TransformerBeamSearchDecoder( + transformer_cell, + bos_id=0, + eos_id=1, + beam_size=4, + var_dim_in_state=2), + max_step_num, + is_test=True) + + enc_output = paddle.rand((2, 4, 64)) + # cross attention bias: [batch_size, n_head, trg_len, src_len] + trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) + # inputs for beam search on Transformer + states = cell.get_initial_states(encoder_output) + enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + enc_output, beam_size=4) + trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + trg_src_attn_bias, self.beam_size) + static_caches = decoder.prepare_static_cache(enc_output) + outputs = dynamic_decoder( + inits=caches, + enc_output=enc_output, + trg_src_attn_bias=trg_src_attn_bias, + static_caches=static_caches) + """ + def __init__(self, cell, start_token, end_token, beam_size, var_dim_in_state): super(TransformerBeamSearchDecoder, @@ -2393,6 +2559,18 @@ def __init__(self, cell, start_token, end_token, beam_size, self.var_dim_in_state = var_dim_in_state def _merge_batch_beams_with_var_dim(self, x): + """ + Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new + tensor with shape `[batch_size * beam_size, ...]`. + + Parameters: + x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The + data type should be float32, float64, int32, int64 or bool. + + Returns: + Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \ + data type is same as `x`. + """ # init length of cache is 0, and it increases with decoding carrying on, # thus need to reshape elaborately var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim @@ -2410,6 +2588,18 @@ def _merge_batch_beams_with_var_dim(self, x): return x def _split_batch_beams_with_var_dim(self, x): + """ + Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new + tensor with shape `[batch_size, beam_size, ...]`. + + Parameters: + x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The + data type should be float32, float64, int32, int64 or bool. + + Returns: + Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \ + data type is same as `x`. + """ var_dim_size = layers.shape(x)[self.var_dim_in_state] x = layers.reshape( x, [-1, self.beam_size] + @@ -2419,6 +2609,38 @@ def _split_batch_beams_with_var_dim(self, x): return x def step(self, time, inputs, states, **kwargs): + """ + Perform a beam search decoding step, which uses `cell` to get probabilities, + and follows a beam search step to calculate scores and select candidate + token ids. + + Note: compared with `BeamSearchDecoder.step`, it feed 2D id tensor shaped + `[batch_size * beam_size, 1]` rather than `[batch_size * beam_size]` combined + position data as inputs to `cell`. + + Parameters: + time(Variable): An `int64` tensor with shape `[1]` provided by the caller, + representing the current time step number of decoding. + inputs(Variable): A tensor variable. It is same as `initial_inputs` + returned by `initialize()` for the first decoding step and + `next_inputs` returned by `step()` for the others. 
It is a int64 + id tensor with shape `[batch_size * beam_size]` + states(Variable): A structure of tensor variables. + It is same as the `initial_states` returned by `initialize()` for + the first decoding step and `beam_search_state` returned by + `step()` for the others. + **kwargs: Additional keyword arguments, provided by the caller. + + Returns: + tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \ + `beam_search_state` and `next_inputs` have the same structure, \ + shape and data type as the input arguments `states` and `inputs` separately. \ + `beam_search_output` is a namedtuple(including scores, predicted_ids, \ + parent_ids as fields) of tensor variables, where \ + `scores, predicted_ids, parent_ids` all has a tensor value shaped \ + `[batch_size, beam_size]` with data type `float32, int64, int64`. \ + `finished` is a `bool` tensor with shape `[batch_size, beam_size]`. + """ # compared to RNN, Transformer has 3D data at every decoding step inputs = layers.reshape(inputs, [-1, 1]) # token pos = layers.ones_like(inputs) * time # pos @@ -2427,6 +2649,11 @@ def step(self, time, inputs, states, **kwargs): cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, **kwargs) + + # squeeze to adapt to BeamSearchDecoder which use 2D logits + cell_outputs = map_structure( + lambda x: layers.squeeze(x, [1]) if len(x.shape) == 3 else x, + cell_outputs) cell_outputs = map_structure(self._split_batch_beams, cell_outputs) next_cell_states = map_structure(self._split_batch_beams_with_var_dim, next_cell_states) @@ -2715,7 +2942,66 @@ def forward(self, enc_input, attn_bias): class TransformerEncoder(Layer): """ - encoder + TransformerEncoder is a stack of N encoder layers. + + Applies a stacked multi-layer gated recurrent unit (GRU) RNN to an input + sequence. + + Parameters: + n_layer (int): The number of encoder layers to be stacked. + n_head (int): The number of heads in the multi-head attention(MHA). + d_key (int): The number of heads in the multi-head attention. Mostly . + d_value (int): The number of heads in the multiheadattention. + d_model (int): The expected feature size in the input and output. + d_inner_hid (int): The hidden layer size in the feedforward network(FFN). + prepostprocess_dropout (float, optional): The dropout probability used + in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 + attention_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. Default 0.1 + relu_dropout (float, optional): The dropout probability used in FFN + in MHA to drop some attention target. Default 0.1 + preprocess_cmd (str, optional): The process applied before each MHA and + FFN sub-layer, and it also would be applied. It should be a string + that includes `d`, `a`, `n` as , where `d` for dropout, `a` for add + residual connection, `n` for layer normalization. + network. Default `n`. + ffn_fc1_act (str, optional): The activation function in the feedforward + network. Default relu. + + dropout(float|list|tuple, optional): The dropout probability after each + GRU. It also can be a list or tuple, including dropout probabilities + for the corresponding GRU. Default 0.0 + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. 
If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import TransformerEncoder + + inputs = paddle.rand((2, 4, 32)) + gru = TransformerEncoder(n_layers=2, input_size=32, hidden_size=64,) + outputs, _ = gru(inputs) # [2, 4, 32] """ def __init__(self, @@ -2725,9 +3011,9 @@ def __init__(self, d_value, d_model, d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, preprocess_cmd="n", postprocess_cmd="da", ffn_fc1_act="relu"): @@ -2908,8 +3194,8 @@ def forward(self, caches=None): for i, decoder_layer in enumerate(self.decoder_layers): dec_output = decoder_layer(dec_input, enc_output, self_attn_bias, - cross_attn_bias, None - if caches is None else caches[i]) + cross_attn_bias, caches[i] + if caches else None) dec_input = dec_output return self.processer(dec_output) From 48d8a3903318b9447d2522a99562178d78d450c4 Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 11 May 2020 11:42:05 +0800 Subject: [PATCH 06/16] Add api docs for TransformerEncoder. 
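For reference, a self-contained dygraph sketch of the documented encoder usage; the shapes and hyper-parameters below are illustrative assumptions only:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.dygraph import to_variable
    from hapi.text import TransformerEncoder

    with fluid.dygraph.guard():
        # encoder input: [batch_size, src_len, d_model]
        enc_input = to_variable(np.random.rand(2, 4, 32).astype("float32"))
        # self attention bias: [batch_size, n_head, src_len, src_len]
        attn_bias = to_variable(np.zeros([2, 2, 4, 4], dtype="float32"))
        encoder = TransformerEncoder(
            n_layer=2, n_head=2, d_key=16, d_value=16, d_model=32,
            d_inner_hid=128)
        enc_output = encoder(enc_input, attn_bias)  # [2, 4, 32]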
--- hapi/text/text.py | 108 ++++++++++++++++++++++++++++------------------ 1 file changed, 65 insertions(+), 43 deletions(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index b5a849d..d320b32 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -2367,10 +2367,20 @@ class TransformerCell(Layer): import paddle import paddle.fluid as fluid + from paddle.fluid.dygraph import Embedding from paddle.incubate.hapi.text import TransformerCell from paddle.incubate.hapi.text import TransformerBeamSearchDecoder - embedder = Embedding(size=[1000, 128]) + class Embedder(fluid.dygraph.Layer): + def __init__(self): + self.word_embedder = Embedding(size=[1000, 128]) + self.pos_embedder = Embedding(size=[500, 128]) + + def forward(self, inputs): + word, position = inputs + return self.word_embedder(word) + self.pos_embedder(position) + + embedder = Embedder() output_layer = Linear(128, 1000) decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) transformer_cell = TransformerCell(decoder, embedder, output_layer) @@ -2392,7 +2402,7 @@ class TransformerCell(Layer): enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( enc_output, beam_size=4) trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, self.beam_size) + trg_src_attn_bias, beam_size=4) static_caches = decoder.prepare_static_cache(enc_output) outputs = dynamic_decoder( inits=caches, @@ -2517,10 +2527,20 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): import paddle import paddle.fluid as fluid + from paddle.fluid.dygraph import Embedding from paddle.incubate.hapi.text import TransformerCell from paddle.incubate.hapi.text import TransformerBeamSearchDecoder - embedder = Embedding(size=[1000, 128]) + class Embedder(fluid.dygraph.Layer): + def __init__(self): + self.word_embedder = Embedding(size=[1000, 128]) + self.pos_embedder = Embedding(size=[500, 128]) + + def forward(self, inputs): + word, position = inputs + return self.word_embedder(word) + self.pos_embedder(position) + + embedder = Embedder() output_layer = Linear(128, 1000) decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) transformer_cell = TransformerCell(decoder, embedder, output_layer) @@ -2542,7 +2562,7 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( enc_output, beam_size=4) trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, self.beam_size) + trg_src_attn_bias, beam_size=4) static_caches = decoder.prepare_static_cache(enc_output) outputs = dynamic_decoder( inits=caches, @@ -2944,53 +2964,33 @@ class TransformerEncoder(Layer): """ TransformerEncoder is a stack of N encoder layers. - Applies a stacked multi-layer gated recurrent unit (GRU) RNN to an input - sequence. - Parameters: n_layer (int): The number of encoder layers to be stacked. - n_head (int): The number of heads in the multi-head attention(MHA). - d_key (int): The number of heads in the multi-head attention. Mostly . - d_value (int): The number of heads in the multiheadattention. + n_head (int): The number of heads in multi-head attention(MHA). + d_key (int): The feature size to transformer queries and keys as in + multi-head attention. Mostly it equals to `d_model // n_head`. + d_value (int): The feature size to transformer values as in multi-head + attention. Mostly it equals to `d_model // n_head`. d_model (int): The expected feature size in the input and output. 
d_inner_hid (int): The hidden layer size in the feedforward network(FFN). prepostprocess_dropout (float, optional): The dropout probability used in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 attention_dropout (float, optional): The dropout probability used in MHA to drop some attention target. Default 0.1 - relu_dropout (float, optional): The dropout probability used in FFN - in MHA to drop some attention target. Default 0.1 + relu_dropout (float, optional): The dropout probability used after FFN + activition. Default 0.1 preprocess_cmd (str, optional): The process applied before each MHA and - FFN sub-layer, and it also would be applied. It should be a string - that includes `d`, `a`, `n` as , where `d` for dropout, `a` for add - residual connection, `n` for layer normalization. - network. Default `n`. + FFN sub-layer, and it also would be applied on output of the last + stacked layer. It should be a string composed of `d`, `a`, `n`, + where `d` for dropout, `a` for add residual connection, `n` for + layer normalization. Default `n`. + postprocess_cmd (str, optional): The process applied after each MHA and + FFN sub-layer. Same as `preprocess_cmd`. It should be a string + composed of `d`, `a`, `n`, where `d` for dropout, `a` for add + residual connection, `n` for layer normalization. Default `da`. ffn_fc1_act (str, optional): The activation function in the feedforward network. Default relu. - dropout(float|list|tuple, optional): The dropout probability after each - GRU. It also can be a list or tuple, including dropout probabilities - for the corresponding GRU. Default 0.0 - is_reverse (bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - Examples: .. 
code-block:: python @@ -2999,9 +2999,12 @@ class TransformerEncoder(Layer): import paddle.fluid as fluid from paddle.incubate.hapi.text import TransformerEncoder - inputs = paddle.rand((2, 4, 32)) - gru = TransformerEncoder(n_layers=2, input_size=32, hidden_size=64,) - outputs, _ = gru(inputs) # [2, 4, 32] + # encoder input: [batch_size, src_len, d_model] + enc_input = paddle.rand((2, 4, 32)) + # self attention bias: [batch_size, n_head, src_len, src_len] + attn_bias = paddle.rand((2, 2, 4, 4)) + encoder = TransformerEncoder(2, 2, 64, 64, 128, 512) + enc_output = encoder(inputs, attn_bias) # [2, 4, 32] """ def __init__(self, @@ -3040,7 +3043,26 @@ def __init__(self, self.processer = PrePostProcessLayer(preprocess_cmd, d_model, prepostprocess_dropout) - def forward(self, enc_input, attn_bias): + def forward(self, enc_input, attn_bias=None): + """ + Applies a stack of N Transformer encoder layers on input sequences. + + Parameters: + enc_input (Variable): The input of Transformer encoder. It is a tensor + with shape `[batch_size, sequence_length, d_model]`. The data + type should be float32 or float64. + attn_bias(Variable, optional): A tensor used in encoder self attention + to mask out attention on unwanted positions, usually the paddings. It + is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. It can be None for inference. The data type should + be float32 or float64. It can be None when nothing wanted to be + masked out. Default None + + Returns: + Variable: The output of Transformer encoder. It is a tensor that has \ + the same shape and data type as `enc_input`. + """ for encoder_layer in self.encoder_layers: enc_output = encoder_layer(enc_input, attn_bias) enc_input = enc_output From b3a1ddf85f83c0b0a7c2c3547603c93aac3dee02 Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 11 May 2020 14:39:26 +0800 Subject: [PATCH 07/16] Add CNN related apis in text.py --- hapi/tests/test_text.py | 34 +++++ hapi/text/__init__.py | 14 +- hapi/text/text.py | 280 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 325 insertions(+), 3 deletions(-) diff --git a/hapi/tests/test_text.py b/hapi/tests/test_text.py index 6f0d014..977656c 100644 --- a/hapi/tests/test_text.py +++ b/hapi/tests/test_text.py @@ -711,5 +711,39 @@ def test_check_output_merge1(self): self.check_output() +class TestCNNEncoder(ModuleApiTest): + def setUp(self): + shape = (2, 32, 8) # [N, C, H] + self.inputs = [np.random.random(shape).astype("float32")] + self.outputs = None + self.attrs = {"num_channels": 32, "num_filters": 64, "num_layers": 2} + self.param_states = {} + + @staticmethod + def model_init(self, num_channels, num_filters, num_layers): + self.cnn_encoder = CNNEncoder( + num_layers=2, + num_channels=num_channels, + num_filters=num_filters, + filter_size=[2, 3], + pool_size=[7, 6]) + + @staticmethod + def model_forward(self, inputs): + return self.cnn_encoder(inputs) + + def make_inputs(self): + inputs = [ + Input( + [None, self.inputs[-1].shape[1], None], + "float32", + name="input"), + ] + return inputs + + def test_check_output_merge0(self): + self.check_output() + + if __name__ == '__main__': unittest.main() diff --git a/hapi/text/__init__.py b/hapi/text/__init__.py index 890e989..80568e3 100644 --- a/hapi/text/__init__.py +++ b/hapi/text/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,15 +16,27 @@ from hapi.text.text import BasicLSTMCell as BasicLSTMCell from hapi.text.text import BasicGRUCell as BasicGRUCell from hapi.text.text import RNN as RNN +from hapi.text.text import StackedLSTMCell as StackedLSTMCell +from hapi.text.text import LSTM as LSTM +from hapi.text.text import BidirectionalLSTM as BidirectionalLSTM +from hapi.text.text import StackedGRUCell as StackedGRUCell +from hapi.text.text import GRU as GRU +from hapi.text.text import BidirectionalGRU as BidirectionalGRU from hapi.text.text import DynamicDecode as DynamicDecode from hapi.text.text import BeamSearchDecoder as BeamSearchDecoder + +from hapi.text.text import Conv1dPoolLayer as Conv1dPoolLayer +from hapi.text.text import CNNEncoder as CNNEncoder + from hapi.text.text import MultiHeadAttention as MultiHeadAttention from hapi.text.text import FFN as FFN from hapi.text.text import TransformerEncoderLayer as TransformerEncoderLayer from hapi.text.text import TransformerDecoderLayer as TransformerDecoderLayer from hapi.text.text import TransformerEncoder as TransformerEncoder from hapi.text.text import TransformerDecoder as TransformerDecoder +from hapi.text.text import TransformerCell as TransformerCell from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearchDecoder + from hapi.text.text import GRUCell as GRUCell from hapi.text.text import GRUEncoderCell as GRUEncoderCell from hapi.text.text import BiGRU as BiGRU diff --git a/hapi/text/text.py b/hapi/text/text.py index d320b32..e338f0c 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -37,7 +37,7 @@ import paddle.fluid as fluid import paddle.fluid.layers.utils as utils from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as -from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm, GRUUnit +from paddle.fluid.dygraph import Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D from paddle.fluid.data_feeder import convert_dtype from paddle.fluid import layers @@ -57,6 +57,8 @@ 'BidirectionalGRU', 'DynamicDecode', 'BeamSearchDecoder', + 'Conv1dPoolLayer', + 'CNNEncoder', 'MultiHeadAttention', 'FFN', 'TransformerEncoderLayer', @@ -2171,7 +2173,7 @@ class DynamicDecode(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import StackedLSTMCell, RNN + from paddle.incubate.hapi.text import StackedLSTMCell, DynamicDecode vocab_size, d_model, = 100, 32 encoder_output = paddle.rand((2, 4, d_model)) @@ -2344,6 +2346,280 @@ def _maybe_copy(state, new_state, step_mask): **kwargs) +class Conv1dPoolLayer(Layer): + """ + This interface is used to construct a callable object of the ``Conv1DPoolLayer`` + class. The ``Conv1DPoolLayer`` class does a ``Conv1D`` and a ``Pool1D`` . + For more details, refer to code examples.The ``Conv1DPoolLayer`` layer calculates + the output based on the input, filter and strides, paddings, dilations, groups, + global_pooling, pool_type, ceil_mode, exclusive parameters. + + Parameters: + num_channels (int): The number of channels in the input data. + num_filters(int): The number of filters. It is the same as the output channels. + filter_size (int): The filter size of Conv1DPoolLayer. + pool_size (int): The pooling size of Conv1DPoolLayer. + conv_stride (int): The stride size of the conv Layer in Conv1DPoolLayer. + Default: 1 + pool_stride (int): The stride size of the pool layer in Conv1DPoolLayer. 
+ Default: 1 + conv_padding (int): The padding size of the conv Layer in Conv1DPoolLayer. + Default: 0 + pool_padding (int): The padding of pool layer in Conv1DPoolLayer. + Default: 0 + act (str): Activation type for conv layer, if it is set to None, activation + is not appended. Default: None. + pool_type (str): Pooling type can be `max` for max-pooling or `avg` for + average-pooling. Default: `max` + dilation (int): The dilation size of the conv Layer. Default: 1. + groups (int): The groups number of the conv Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the + first half of the filters is only connected to the first half of the + input channels, while the second half of the filters is only connected + to the second half of the input channels. Default: 1. + global_pooling (bool): Whether to use the global pooling. If it is true, + `pool_size` and `pool_padding` would be ignored. Default: False + ceil_mode (bool, optional): Whether to use the ceil function to calculate output + height and width.False is the default. If it is set to False, the floor function + will be used. Default: False. + exclusive (bool, optional): Whether to exclude padding points in average pooling mode. + Default: True. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: False + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv2d. If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with :math:`Normal(0.0, std)`, + and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + + Example: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import Conv1dPoolLayer + + # input: [batch_size, num_channels, sequence_length] + input = paddle.rand((2, 32, 4)) + cov2d = Conv1dPoolLayer(num_channels=32, + num_filters=64, + filter_size=2, + pool_size=2) + output = cov2d(input) + """ + + def __init__(self, + num_channels, + num_filters, + filter_size, + pool_size, + conv_stride=1, + pool_stride=1, + conv_padding=0, + pool_padding=0, + act=None, + pool_type='max', + global_pooling=False, + dilation=1, + groups=None, + ceil_mode=False, + exclusive=True, + use_cudnn=False, + param_attr=None, + bias_attr=None): + super(Conv1dPoolLayer, self).__init__() + self._conv2d = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=[filter_size, 1], + stride=[conv_stride, 1], + padding=[conv_padding, 0], + dilation=[dilation, 1], + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act) + self._pool2d = Pool2D( + pool_size=[pool_size, 1], + pool_type=pool_type, + pool_stride=[pool_stride, 1], + pool_padding=[pool_padding, 0], + global_pooling=global_pooling, + use_cudnn=use_cudnn, + ceil_mode=ceil_mode, + exclusive=exclusive) + + def forward(self, input): + """ + Performs conv1d and pool1d on the input. 
+ + Parameters: + input (Variable): A 3-D Tensor, shape is [N, C, H] where N, C and H + representing `batch_size`, `num_channels` and `sequence_length` + separately. data type can be float32 or float64 + + Returns: + Variable: The 3-D output tensor after conv and pool. It has the same \ + data type as input. + """ + x = fluid.layers.unsqueeze(input, axes=[-1]) + x = self._conv2d(x) + x = self._pool2d(x) + x = fluid.layers.squeeze(x, axes=[-1]) + return x + + +class CNNEncoder(Layer): + """ + This interface is used to construct a callable object of the ``CNNEncoder`` + class. The ``CNNEncoder`` is composed of multiple ``Conv1dPoolLayer`` . + ``CNNEncoder`` can define every Conv1dPoolLayer with different or same parameters. + The ``Conv1dPoolLayer`` in ``CNNEncoder`` is parallel. The results of every + ``Conv1dPoolLayer`` will concat at the channel dimension as the final output. + + Parameters: + num_channels(int|list|tuple): The number of channels in the input data. If + `num_channels` is a list or tuple, the length of `num_channels` must + equal to `num_layers`. If `num_channels` is a int, all conv1dpoollayer's + `num_channels` are the value of `num_channels`. + num_filters(int|list|tuple): The number of filters. It is the same as the + output channels. If `num_filters` is a list or tuple, the length of + `num_filters` must equal `num_layers`. If `num_filters` is a int, + all conv1dpoollayer's `num_filters` are the value of `num_filters`. + filter_size(int|list|tuple): The filter size of Conv1DPoolLayer in CNNEncoder. + If `filter_size` is a list or tuple, the length of `filter_size` must + equal `num_layers`. If `filter_size` is a int, all conv1dpoollayer's + `filter_size` are the value of `filter_size`. + pool_size(int|list|tuple): The pooling size of Conv1DPoolLayer in CNNEncoder. + If `pool_size` is a list or tuple, the length of `pool_size` must equal + `num_layers`. If `pool_size` is a int, all conv1dpoollayer's `pool_size` + are the value of `pool_size`. + num_layers(int): The number of conv1dpoolLayer used in CNNEncoder. + conv_stride(int|list|tuple): The stride size of the conv Layer in Conv1DPoolLayer. + If `conv_stride` is a list or tuple, the length of `conv_stride` must + equal `num_layers`. If conv_stride is a int, all conv1dpoollayer's `conv_stride` + are the value of `conv_stride`. Default: 1 + pool_stride(int|list|tuple): The stride size of the pool layer in Conv1DPoolLayer. + If `pool_stride` is a list or tuple, the length of `pool_stride` must + equal `num_layers`. If `pool_stride` is a int, all conv1dpoollayer's `pool_stride` + are the value of `pool_stride`. Default: 1 + conv_padding(int|list|tuple): The padding size of the conv Layer in Conv1DPoolLayer. + If `conv_padding` is a list or tuple, the length of `conv_padding` must + equal `num_layers`. If `conv_padding` is a int, all conv1dpoollayer's `conv_padding` + are the value of `conv_padding`. Default: 0 + pool_padding(int|list|tuple): The padding size of pool layer in Conv1DPoolLayer. + If `pool_padding` is a list or tuple, the length of `pool_padding` must + equal `num_layers`.If `pool_padding` is a int, all conv1dpoollayer's `pool_padding` + are the value of `pool_padding`. Default: 0 + act (str|list|tuple): Activation type for `Conv1dPoollayer` layer, if it is set to None, + activation is not appended. Default: None. + pool_type (str): Pooling type can be `max` for max-pooling or `avg` for + average-pooling. Default: `max` + global_pooling (bool): Whether to use the global pooling. 
If it is true, + `pool_size` and `pool_padding` would be ignored. Default: False + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: False + + Example: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import CNNEncoder + + # input: [batch_size, num_channels, sequence_length] + input = paddle.rand((2, 32, 8)) + cov_encoder = CNNEncoder(num_layers=2, + num_channels=32, + num_filters=64, + filter_size=[2, 3], + pool_size=[7, 6]) + output = cov_encoder(input) # [2, 128, 1] + """ + + def __init__(self, + num_channels, + num_filters, + filter_size, + pool_size, + num_layers=1, + conv_stride=1, + pool_stride=1, + conv_padding=0, + pool_padding=0, + act=None, + pool_type='max', + global_pooling=False, + use_cudnn=False): + super(CNNEncoder, self).__init__() + self.num_layers = num_layers + self.num_channels = num_channels + self.num_filters = num_filters + self.filter_size = filter_size + self.pool_size = pool_size + self.conv_stride = conv_stride + self.pool_stride = pool_stride + self.conv_padding = conv_padding + self.pool_padding = pool_padding + self.use_cudnn = use_cudnn + self.act = act + self.pool_type = pool_type + self.global_pooling = global_pooling + self.conv1d_pool_layers = fluid.dygraph.LayerList([ + Conv1dPoolLayer( + num_channels=self.num_channels if + isinstance(self.num_channels, int) else self.num_channels[i], + num_filters=self.num_filters + if isinstance(self.num_channels, int) else self.num_filters[i], + filter_size=self.filter_size + if isinstance(self.filter_size, int) else self.filter_size[i], + pool_size=self.pool_size + if isinstance(self.pool_size, int) else self.pool_size[i], + conv_stride=self.conv_stride + if isinstance(self.conv_stride, int) else self.conv_stride[i], + pool_stride=self.pool_stride + if isinstance(self.pool_stride, int) else self.pool_stride[i], + conv_padding=self.conv_padding + if isinstance(self.conv_padding, + int) else self.conv_padding[i], + pool_padding=self.pool_padding + if isinstance(self.pool_padding, + int) else self.pool_padding[i], + act=self.act[i] + if isinstance(self.act, (list, tuple)) else self.act, + pool_type=self.pool_type, + global_pooling=self.global_pooling, + use_cudnn=self.use_cudnn) for i in range(num_layers) + ]) + + def forward(self, input): + """ + Performs multiple parallel conv1d and pool1d, and concat the results of + them at the channel dimension to produce the final output. + + Parameters: + input (Variable): A 3-D Tensor, shape is [N, C, H] where N, C and H + representing `batch_size`, `num_channels` and `sequence_length` + separately. data type can be float32 or float64 + + Returns: + Variable: The 3-D output tensor produced by concatenating results of \ + all Conv1dPoolLayer. It has the same data type as input. + """ + res = [ + conv1d_pool_layer(input) + for conv1d_pool_layer in self.conv1d_pool_layers + ] + out = fluid.layers.concat(input=res, axis=1) + return out + + class TransformerCell(Layer): """ TransformerCell wraps a Transformer decoder producing logits from `inputs` From 71c7ae7a4c3ff59f771ebaba5775651c4a917f83 Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 12 May 2020 01:53:25 +0800 Subject: [PATCH 08/16] Add api docs for Transformer related apis. 
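As a companion to the new MultiHeadAttention docs, a small dygraph sketch using the argument order of the constructor in this file (d_key, d_value, d_model, n_head); shapes are illustrative assumptions:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.dygraph import to_variable
    from hapi.text import MultiHeadAttention

    with fluid.dygraph.guard():
        # queries: [batch_size, sequence_length, d_model]
        query = to_variable(np.random.rand(2, 4, 128).astype("float32"))
        # self attention bias: [batch_size, n_head, src_len, src_len]
        attn_bias = to_variable(np.zeros([2, 2, 4, 4], dtype="float32"))
        multi_head_attn = MultiHeadAttention(64, 64, 128, 2)
        # None keys/values fall back to queries, i.e. self attention
        out = multi_head_attn(query, None, None, attn_bias)  # [2, 4, 128]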
--- hapi/text/text.py | 487 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 451 insertions(+), 36 deletions(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index e338f0c..2e9d9b3 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -214,8 +214,7 @@ class BasicLSTMCell(RNNCell): h_{t} & = o_{t} act_c (c_{t}) Please refer to `An Empirical Exploration of Recurrent Network Architectures - `_ - for more details. + `_ for more details. Parameters: input_size (int): The input size in the LSTM cell. @@ -547,8 +546,7 @@ class BasicGRUCell(RNNCell): h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} Please refer to `An Empirical Exploration of Recurrent Network Architectures - `_ - for more details. + `_ for more details. Parameters: input_size (int): The input size for the first GRU cell. @@ -2719,23 +2717,26 @@ def forward(self, attention to mask out attention on unwanted target positions. It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, where the unwanted positions have `-INF` values and the others - have 0 values. It can be None for inference. The data type should - be float32 or float64. - trg_src_attn_bias(Variable, optional): A tensor used in decoder encoder + have 0 values. It can be None when nothing wanted or needed to + be masked out. It can be None for inference. The data type should + be float32 or float64. Default None + trg_src_attn_bias(Variable, optional): A tensor used in decoder-encoder cross attention to mask out unwanted attention on source (encoder output). It is a tensor with shape `[batch_size, n_head, target_length, source_length]`, where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. - static_caches(list): It stores the multi-head attention intermediate - results of encoder output. It is a list of dict where the length - of list is decoder layer number, and each dict has `static_k` and - `static_v` as keys and values are stored results. Default empty list + have 0 values. It can be None when nothing wanted or needed to + be masked out. The data type should be float32 or float64. Default None + static_caches(list): It stores projected results of encoder output + to be used as keys and values in decoder-encoder cross attention + It is a list of dict where the length of list is decoder layer + number, and each dict has `static_k` and `static_v` as keys and + values are stored results. Default empty list Returns: tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \ is a float32 or float64 3D tensor representing logits shaped \ `[batch_size, sequence_length, vocab_size]`. `new_states has \ - the same structure and date type with `states` while the length \ + the same structure and data type with `states` while the length \ is one larger since the intermediate results of current step are \ concatenated into it. 
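The "length is one larger" behaviour of the returned states can be pictured with a small NumPy sketch of a growing decoder cache. The names `step`, `k_t` and `v_t` are hypothetical and chosen only for illustration; the real cell stores Paddle tensors rather than arrays:

.. code-block:: python

    import numpy as np

    batch_size, n_head, d_key = 2, 2, 64

    # cache holding keys/values of previously decoded steps (0 steps so far)
    cache = {
        "k": np.zeros((batch_size, n_head, 0, d_key), dtype="float32"),
        "v": np.zeros((batch_size, n_head, 0, d_key), dtype="float32"),
    }

    def step(cache, k_t, v_t):
        # concatenate the projected key/value of the current step onto the cache,
        # so the time dimension of the new state is one larger than before
        return {
            "k": np.concatenate([cache["k"], k_t], axis=2),
            "v": np.concatenate([cache["v"], v_t], axis=2),
        }

    k_t = np.random.rand(batch_size, n_head, 1, d_key).astype("float32")
    v_t = np.random.rand(batch_size, n_head, 1, d_key).astype("float32")
    new_cache = step(cache, k_t, v_t)
    print(new_cache["k"].shape)  # (2, 2, 1, 64)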
""" @@ -2830,7 +2831,7 @@ def forward(self, inputs): max_step_num, is_test=True) - enc_output = paddle.rand((2, 4, 64)) + enc_output = paddle.rand((2, 4, 128)) # cross attention bias: [batch_size, n_head, trg_len, src_len] trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) # inputs for beam search on Transformer @@ -3015,7 +3016,37 @@ def forward(self, x, residual=None): class MultiHeadAttention(Layer): """ - Multi-Head Attention + MultiHead Attention mapps queries and a set of key-value pairs to outputs + by jointly attending to information from different representation subspaces, + as what multi-head indicates it performs multiple attention in parallel. + + Please refer to `Attention Is All You Need `_ + for more details. + + Parameters: + d_key (int): The feature size to transformer queries and keys as in + multi-head attention. Mostly it equals to `d_model // n_head`. + d_value (int): The feature size to transformer values as in multi-head + attention. Mostly it equals to `d_model // n_head`. + d_model (int): The expected feature size in the input and output. + n_head (int): The number of heads in multi-head attention(MHA). + dropout_rate (float, optional): The dropout probability used in MHA to + drop some attention target. Default 0.1 + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import MultiHeadAttention + + # encoder input: [batch_size, sequence_length, d_model] + query = paddle.rand((2, 4, 128)) + # self attention bias: [batch_size, n_head, src_len, src_len] + attn_bias = paddle.rand((2, 2, 4, 4)) + multi_head_attn = MultiHeadAttention(64, 64, 2, 128) + output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] """ def __init__(self, @@ -3062,6 +3093,37 @@ def __init__(self, bias_attr=False) def _prepare_qkv(self, queries, keys, values, cache=None): + """ + Prapares linear projected queries, keys and values for usage of subsequnt + multiple attention in parallel. If `cache` is not None, using cached + results to reduce redundant calculations. + + Parameters: + queries (Variable): The queries for multi-head attention. It is a + tensor with shape `[batch_size, sequence_length, d_model]`. The + data type should be float32 or float64. + keys (Variable, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. The + data type should be float32 or float64. + values (Variable, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float32 or float64. + cache(dict, optional): It is a dict with `k` and `v` as keys, and + values cache the multi-head attention intermediate results of + history decoding steps for decoder self attention; Or a dict + with `static_k` and `statkc_v` as keys, and values stores intermediate + results of encoder output for decoder-encoder cross attention. + If it is for decoder self attention, values for `k` and `v` would + be updated by new tensors concatanating raw tensors with intermediate + results of current step. It is only used for inference and should + be None for training. Default None + + Returns: + tuple: A tuple including linear projected keys and values. These two \ + tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ + and `[batch_size, n_head, sequence_length, d_value]` separately, \ + and their data types are same as inputs. 
+ """ if keys is None: # self-attention keys, values = queries, queries static_kv = False @@ -3097,7 +3159,47 @@ def _prepare_qkv(self, queries, keys, values, cache=None): return q, k, v - def forward(self, queries, keys, values, attn_bias, cache=None): + def forward(self, + queries, + keys=None, + values=None, + attn_bias=None, + cache=None): + """ + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. + + Parameters: + queries (Variable): The queries for multi-head attention. It is a + tensor with shape `[batch_size, sequence_length, d_model]`. The + data type should be float32 or float64. + keys (Variable, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. The + data type should be float32 or float64. + values (Variable, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float32 or float64. + attn_bias (Variable, optional): A tensor used in multi-head attention + to mask out attention on unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be masked out. Default None + cache(dict, optional): It is a dict with `k` and `v` as keys, and + values cache the multi-head attention intermediate results of + history decoding steps for decoder self attention; Or a dict + with `static_k` and `statkc_v` as keys, and values stores intermediate + results of encoder output for decoder-encoder cross attention. + If it is for decoder self attention, values for `k` and `v` would + be updated by new tensors concatanating raw tensors with intermediate + results of current step. It is only used for inference and should + be None for training. Default None + + Returns: + Variable: The output of multi-head attention. It is a tensor \ + that has the same shape and data type as `queries`. + """ # compute q ,k ,v q, k, v = self._prepare_qkv(queries, keys, values, cache) @@ -3122,6 +3224,25 @@ def forward(self, queries, keys, values, attn_bias, cache=None): return out def cal_kv(self, keys, values): + """ + Applies linear projection on input keys and values, then splits heads + (reshape and transpose) to get keys and values from different representation + subspaces for usage of subsequnt multiple attention in parallel. + + Parameters: + keys (Variable, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. The + data type should be float32 or float64. + values (Variable, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float32 or float64. + + Returns: + tuple: A tuple including linear projected keys and values. These two \ + tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ + and `[batch_size, n_head, sequence_length, d_value]` separately, \ + and their data types are same as inputs. 
+ """ k = self.k_fc(keys) v = self.v_fc(values) k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) @@ -3133,7 +3254,17 @@ def cal_kv(self, keys, values): class FFN(Layer): """ - Feed-Forward Network + A fully connected feed-forward network applied to each position separately + and identically. This consists of two linear transformations with a activation + and dropout in between. + + Parameters: + d_inner_hid (int): The hidden size in the feedforward network(FFN). + d_model (int): The expected feature size in the input and output. + dropout_rate (float, optional): The dropout probability used after + activition. Default 0.1 + ffn_fc1_act (str, optional): The activation function in the feedforward + network. Default relu. """ def __init__(self, @@ -3156,6 +3287,19 @@ def __init__(self, self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) def forward(self, x): + """ + Applies a fully connected feed-forward network on each position of the + input sequences separately and identically. + + Parameters: + x (Variable): The input of feed-forward network. It is a tensor + with shape `[batch_size, sequence_length, d_model]`. The data + type should be float32 or float64. + + Returns: + Variable: The output of feed-forward network. It is a tensor that has \ + the same shape and data type as `enc_input`. + """ hidden = self.fc1(x) if self.dropout_rate: hidden = layers.dropout( @@ -3166,7 +3310,50 @@ def forward(self, x): class TransformerEncoderLayer(Layer): """ - EncoderLayer + TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) + attention and feedforward network. Before and after each sub-layer, pre-process + and post-precess would be applied on the input and output. + + Parameters: + n_head (int): The number of heads in multi-head attention(MHA). + d_key (int): The feature size to transformer queries and keys as in + multi-head attention. Mostly it equals to `d_model // n_head`. + d_value (int): The feature size to transformer values as in multi-head + attention. Mostly it equals to `d_model // n_head`. + d_model (int): The expected feature size in the input and output. + d_inner_hid (int): The hidden layer size in the feedforward network(FFN). + prepostprocess_dropout (float, optional): The dropout probability used + in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 + attention_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. Default 0.1 + relu_dropout (float, optional): The dropout probability used after FFN + activition. Default 0.1 + preprocess_cmd (str, optional): The process applied before each MHA and + FFN sub-layer, and it also would be applied on output of the last + stacked layer. It should be a string composed of `d`, `a`, `n`, + where `d` for dropout, `a` for add residual connection, `n` for + layer normalization. Default `n`. + postprocess_cmd (str, optional): The process applied after each MHA and + FFN sub-layer. Same as `preprocess_cmd`. It should be a string + composed of `d`, `a`, `n`, where `d` for dropout, `a` for add + residual connection, `n` for layer normalization. Default `da`. + ffn_fc1_act (str, optional): The activation function in the feedforward + network. Default relu. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import TransformerEncoderLayer + + # encoder input: [batch_size, src_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # self attention bias: [batch_size, n_head, src_len, src_len] + attn_bias = paddle.rand((2, 2, 4, 4)) + encoder_layer = TransformerEncoderLayer(2, 2, 64, 64, 128, 512) + enc_output = encoder_layer(inputs, attn_bias) # [2, 4, 128] """ def __init__(self, @@ -3175,9 +3362,9 @@ def __init__(self, d_value, d_model, d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, preprocess_cmd="n", postprocess_cmd="da", ffn_fc1_act="relu", @@ -3226,7 +3413,25 @@ def __init__(self, prepostprocess_dropout, reused_post_ffn_layernorm) - def forward(self, enc_input, attn_bias): + def forward(self, enc_input, attn_bias=None): + """ + Applies a Transformer encoder layer on the input. + + Parameters: + enc_input (Variable): The input of Transformer encoder layer. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float32 or float64. + attn_bias(Variable, optional): A tensor used in encoder self attention + to mask out attention on unwanted positions, usually the paddings. It + is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be masked out. Default None + + Returns: + Variable: The output of Transformer encoder layer. It is a tensor that \ + has the same shape and data type as `enc_input`. + """ attn_output = self.self_attn( self.preprocesser1(enc_input), None, None, attn_bias) attn_output = self.postprocesser1(attn_output, enc_input) @@ -3276,11 +3481,11 @@ class TransformerEncoder(Layer): from paddle.incubate.hapi.text import TransformerEncoder # encoder input: [batch_size, src_len, d_model] - enc_input = paddle.rand((2, 4, 32)) + enc_input = paddle.rand((2, 4, 128)) # self attention bias: [batch_size, n_head, src_len, src_len] attn_bias = paddle.rand((2, 2, 4, 4)) encoder = TransformerEncoder(2, 2, 64, 64, 128, 512) - enc_output = encoder(inputs, attn_bias) # [2, 4, 32] + enc_output = encoder(inputs, attn_bias) # [2, 4, 128] """ def __init__(self, @@ -3331,9 +3536,8 @@ def forward(self, enc_input, attn_bias=None): to mask out attention on unwanted positions, usually the paddings. It is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, where the unwanted positions have `-INF` values and the others - have 0 values. It can be None for inference. The data type should - be float32 or float64. It can be None when nothing wanted to be - masked out. Default None + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be masked out. Default None Returns: Variable: The output of Transformer encoder. It is a tensor that has \ @@ -3348,7 +3552,58 @@ def forward(self, enc_input, attn_bias=None): class TransformerDecoderLayer(Layer): """ - decoder + TransformerDecoderLayer is composed of three sub-layers which are decoder + self (multi-head) attention, decoder-encoder cross attention and feedforward + network. Before and after each sub-layer, pre-process and post-precess would + be applied on the input and output. 
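The process command strings such as `n` and `da` used throughout these layers can be read as a tiny interpreter over the characters `d`, `a`, `n`. The following NumPy sketch is illustrative only: the real `PrePostProcessLayer` uses a learned `LayerNorm` and Paddle's dropout, not these helpers:

.. code-block:: python

    import numpy as np

    def layer_norm(x, eps=1e-5):
        mean = x.mean(axis=-1, keepdims=True)
        var = x.var(axis=-1, keepdims=True)
        return (x - mean) / np.sqrt(var + eps)

    def pre_post_process(x, residual=None, cmd="da", dropout_rate=0.0):
        # 'd' -> dropout, 'a' -> add residual connection, 'n' -> layer normalization
        for c in cmd:
            if c == "a" and residual is not None:
                x = x + residual
            elif c == "n":
                x = layer_norm(x)
            elif c == "d" and dropout_rate:
                mask = (np.random.rand(*x.shape) >= dropout_rate) / (1.0 - dropout_rate)
                x = x * mask
        return x

    x = np.random.rand(2, 4, 128).astype("float32")
    sub_layer_out = np.random.rand(2, 4, 128).astype("float32")
    y = pre_post_process(sub_layer_out, residual=x, cmd="da")  # post-process
    print(y.shape)  # (2, 4, 128)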
+ + Parameters: + n_head (int): The number of heads in multi-head attention(MHA). + d_key (int): The feature size to transformer queries and keys as in + multi-head attention. Mostly it equals to `d_model // n_head`. + d_value (int): The feature size to transformer values as in multi-head + attention. Mostly it equals to `d_model // n_head`. + d_model (int): The expected feature size in the input and output. + d_inner_hid (int): The hidden layer size in the feedforward network(FFN). + prepostprocess_dropout (float, optional): The dropout probability used + in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 + attention_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. Default 0.1 + relu_dropout (float, optional): The dropout probability used after FFN + activition. Default 0.1 + preprocess_cmd (str, optional): The process applied before each MHA and + FFN sub-layer, and it also would be applied on output of the last + stacked layer. It should be a string composed of `d`, `a`, `n`, + where `d` for dropout, `a` for add residual connection, `n` for + layer normalization. Default `n`. + postprocess_cmd (str, optional): The process applied after each MHA and + FFN sub-layer. Same as `preprocess_cmd`. It should be a string + composed of `d`, `a`, `n`, where `d` for dropout, `a` for add + residual connection, `n` for layer normalization. Default `da`. + ffn_fc1_act (str, optional): The activation function in the feedforward + network. Default relu. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import TransformerDecoderLayer + + # decoder input: [batch_size, trg_len, d_model] + dec_input = paddle.rand((2, 4, 128)) + # encoder output: [batch_size, src_len, d_model] + enc_output = paddle.rand((2, 6, 128)) + # self attention bias: [batch_size, n_head, trg_len, trg_len] + self_attn_bias = paddle.rand((2, 2, 4, 4)) + # cross attention bias: [batch_size, n_head, trg_len, src_len] + cross_attn_bias = paddle.rand((2, 2, 4, 6)) + decoder_layer = TransformerDecoderLayer(2, 64, 64, 128, 512) + output = decoder_layer(dec_input, + enc_output, + self_attn_bias, + cross_attn_bias) # [2, 4, 128] """ def __init__(self, @@ -3438,9 +3693,41 @@ def __init__(self, def forward(self, dec_input, enc_output, - self_attn_bias, - cross_attn_bias, + self_attn_bias=None, + cross_attn_bias=None, cache=None): + """ + Applies a Transformer decoder layer on the input. + + Parameters: + dec_input (Variable): The input of Transformer decoder. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + enc_output (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + self_attn_bias (Variable, optional): A tensor used in decoder self attention + to mask out attention on unwanted positions, usually the subsequent positions. + It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be masked out. Default None + cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross + attention to mask out attention on unwanted positions, usually the paddings. 
+ It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be masked out. Default None + caches(dict, optional): It caches the multi-head attention intermediate + results of history decoding steps and encoder output. It is a dict + has `k`, `v`, `static_k`, `statkc_v` as keys and values are cached + results. It is only used for inference and should be None for + training. Default None + + Returns: + Variable: The output of Transformer decoder layer. It is a tensor \ + that has the same shape and data type as `dec_input`. + """ self_attn_output = self.self_attn( self.preprocesser1(dec_input), None, None, self_attn_bias, cache) self_attn_output = self.postprocesser1(self_attn_output, dec_input) @@ -3459,12 +3746,71 @@ def forward(self, class TransformerDecoder(Layer): """ - decoder + TransformerDecoder is a stack of N decoder layers. + + Parameters: + n_layer (int): The number of encoder layers to be stacked. + n_head (int): The number of heads in multi-head attention(MHA). + d_key (int): The feature size to transformer queries and keys as in + multi-head attention. Mostly it equals to `d_model // n_head`. + d_value (int): The feature size to transformer values as in multi-head + attention. Mostly it equals to `d_model // n_head`. + d_model (int): The expected feature size in the input and output. + d_inner_hid (int): The hidden layer size in the feedforward network(FFN). + prepostprocess_dropout (float, optional): The dropout probability used + in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 + attention_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. Default 0.1 + relu_dropout (float, optional): The dropout probability used after FFN + activition. Default 0.1 + preprocess_cmd (str, optional): The process applied before each MHA and + FFN sub-layer, and it also would be applied on output of the last + stacked layer. It should be a string composed of `d`, `a`, `n`, + where `d` for dropout, `a` for add residual connection, `n` for + layer normalization. Default `n`. + postprocess_cmd (str, optional): The process applied after each MHA and + FFN sub-layer. Same as `preprocess_cmd`. It should be a string + composed of `d`, `a`, `n`, where `d` for dropout, `a` for add + residual connection, `n` for layer normalization. Default `da`. + ffn_fc1_act (str, optional): The activation function in the feedforward + network. Default relu. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import TransformerDecoder + + # decoder input: [batch_size, trg_len, d_model] + dec_input = paddle.rand((2, 4, 128)) + # encoder output: [batch_size, src_len, d_model] + enc_output = paddle.rand((2, 6, 128)) + # self attention bias: [batch_size, n_head, trg_len, trg_len] + self_attn_bias = paddle.rand((2, 2, 4, 4)) + # cross attention bias: [batch_size, n_head, trg_len, src_len] + cross_attn_bias = paddle.rand((2, 2, 4, 6)) + decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) + dec_output = decoder(dec_input, + enc_output, + self_attn_bias, + cross_attn_bias) # [2, 4, 128] """ - def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd): + def __init__(self, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + ffn_fc1_act="relu"): super(TransformerDecoder, self).__init__() self.n_layer = n_layer @@ -3487,9 +3833,42 @@ def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, def forward(self, dec_input, enc_output, - self_attn_bias, - cross_attn_bias, + self_attn_bias=None, + cross_attn_bias=None, caches=None): + """ + Applies a stack of N Transformer decoder layers on inputs. + + Parameters: + dec_input (Variable): The input of Transformer decoder. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + enc_output (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + self_attn_bias (Variable, optional): A tensor used in decoder self attention + to mask out attention on unwanted positions, usually the subsequent positions. + It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be masked out. Default None + cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross + attention to mask out attention on unwanted positions, usually the paddings. + It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be masked out. Default None + caches(list, optional): It caches the multi-head attention intermediate results + of history decoding steps and encoder output. It is a list of dict + where the length of list is decoder layer number, and each dict + has `k`, `v`, `static_k`, `statkc_v` as keys and values are cached + results. It is only used for inference and should be None for + training. Default None + + Returns: + Variable: The output of Transformer decoder. It is a tensor that has \ + the same shape and data type as `dec_input`. 
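The stacking behaviour documented here amounts to threading the running decoder output through each layer together with that layer's own cache. A schematic Python sketch follows; `run_decoder_stack` and the dummy layers are hypothetical names used only for illustration:

.. code-block:: python

    import numpy as np

    def run_decoder_stack(decoder_layers, dec_input, enc_output,
                          self_attn_bias=None, cross_attn_bias=None, caches=None):
        # apply each decoder layer in turn, handing layer i its own cache entry
        dec_output = dec_input
        for i, decoder_layer in enumerate(decoder_layers):
            dec_output = decoder_layer(
                dec_output, enc_output, self_attn_bias, cross_attn_bias,
                caches[i] if caches is not None else None)
        return dec_output

    # two dummy "layers" standing in for TransformerDecoderLayer instances
    dummy_layers = [lambda x, enc, sb, cb, cache: x + 1.0 for _ in range(2)]
    out = run_decoder_stack(dummy_layers,
                            np.zeros((2, 4, 128), dtype="float32"),
                            np.zeros((2, 6, 128), dtype="float32"))
    print(out.mean())  # 2.0: each dummy layer just adds 1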
+ """ for i, decoder_layer in enumerate(self.decoder_layers): dec_output = decoder_layer(dec_input, enc_output, self_attn_bias, cross_attn_bias, caches[i] @@ -3499,6 +3878,22 @@ def forward(self, return self.processer(dec_output) def prepare_static_cache(self, enc_output): + """ + Generate a list of dict where the length of list is decoder layer number. + Each dict has `static_k`, `statkc_v` as keys, and values are projected + results of encoder output to be used as keys and values in decoder-encoder + cross (multi-head) attention. Used in inference. + + Parameters: + enc_output (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + + Returns: + list: A list of dict. Each dict has `static_k`, `statkc_v` as keys, \ + and values are projected results of encoder output to be used as \ + keys and values in decoder-encoder cross (multi-head) attention. + """ return [ dict( zip(("static_k", "static_v"), @@ -3507,6 +3902,26 @@ def prepare_static_cache(self, enc_output): ] def prepare_incremental_cache(self, enc_output): + """ + Generate a list of dict where the length of list is decoder layer number. + Each dict has `k`, `v` as keys, and values are empty tensors with shape + `[batch_size, n_head, 0, d_key]` and `[batch_size, n_head, 0, d_value]`, + representing the decoder self (multi-head) attention intermediate results, + and 0 is the initial length which would increase as inference decoding + continued. Used in inference. + + Parameters: + enc_output (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. Actually, it is used to provide batch + size for Transformer initial states(caches), thus any tensor has + wanted batch size can be used here. + + Returns: + list: A list of dict. Each dict has `k`, `v` as keys, and values are \ + empty tensors representing intermediate results of history decoding \ + steps in decoder self (multi-head) attention at time step 0. + """ return [{ "k": layers.fill_constant_batch_size_like( input=enc_output, From f75b39e88327c009181f70cf570589e42938efbe Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 12 May 2020 20:35:24 +0800 Subject: [PATCH 09/16] Add api docs for sequence labeling related apis. --- hapi/text/text.py | 328 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 302 insertions(+), 26 deletions(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index 2e9d9b3..860d3e7 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -2969,13 +2969,37 @@ def step(self, time, inputs, states, **kwargs): ### Transformer Modules ### class PrePostProcessLayer(Layer): """ - PrePostProcessLayer + PrePostProcessLayer is used before/after each multi-head attention(MHA) and + feed-forward network(FFN) sub-layer to perform some specific process on + inputs/outputs. + + Parameters: + process_cmd (str): The process applied before/after each MHA and + FFN sub-layer. It should be a string composed of `d`, `a`, `n`, + where `d` for dropout, `a` for add residual connection, `n` for + layer normalization. + d_model (int): The expected feature size in the input and output. + dropout_rate (float): The dropout probability if the process includes + dropout. Default 0.1 + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import PrePostProcessLayer + + # input: [batch_size, sequence_length, d_model] + x = paddle.rand((2, 4, 32)) + process = PrePostProcessLayer('n', 32) + out = process(x) # [2, 4, 32] """ def __init__(self, process_cmd, d_model, - dropout_rate, + dropout_rate=0.1, reused_layer_norm=None): super(PrePostProcessLayer, self).__init__() self.process_cmd = process_cmd @@ -3006,6 +3030,21 @@ def __init__(self, if dropout_rate else x) def forward(self, x, residual=None): + """ + Applies `process_cmd` specified process on `x`. + + Parameters: + x (Variable): The tensor to be processed. The data type should be float32 + or float64. The shape is `[batch_size, sequence_length, d_model]`. + + residual (Variable, optional): Only used if the process includes + residual connection. It has the same shape and data type as `x`. + Default None + + Returns: + Variable: The processed tensor. It has the same shape and data type \ + as `x`. + """ for i, cmd in enumerate(self.process_cmd): if cmd == "a": x = self.functors[i](x, residual) @@ -3205,7 +3244,7 @@ def forward(self, # scale dot product attention product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5) + x=q, y=k, transpose_y=True, alpha=self.d_key**-0.5) if attn_bias is not None: product += attn_bias weights = layers.softmax(product) @@ -3265,12 +3304,25 @@ class FFN(Layer): activition. Default 0.1 ffn_fc1_act (str, optional): The activation function in the feedforward network. Default relu. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import FFN + + # input: [batch_size, sequence_length, d_model] + x = paddle.rand((2, 4, 32)) + ffn = FFN(128, 32) + out = ffn(x) # [2, 4, 32] """ def __init__(self, d_inner_hid, d_model, - dropout_rate, + dropout_rate=0.1, fc1_act="relu", reused_fc1=None, reused_fc2=None): @@ -4032,13 +4084,78 @@ def forward(self, input_feature): class LinearChainCRF(Layer): - def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): - super(LinearChainCRF, self).__init__() + """ + Computes the negtive log-likelihood of tag sequences in a linear chain CRF. + Using terminologies of undirected probabilistic graph model, it calculates + probability using unary potentials (for emission) and binary potentials + (for transition). + This layer creates a learnable parameter shaped `[size + 2, size]` (`size` + is for the number of tags), where: + + 1. the first row is for starting weights, denoted as $a$ here + + 2. the second row is for ending weights, denoted as $b$ here. + + 3. the remaining rows is a matrix for transition weights. + + Denote input tensor of unary potentials(emission) as $x$ , then the probability + of a tag sequence $s$ of length $L$ is defined as: + + $$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L} + + \sum_{l=1}^L x_{s_l} + + \sum_{l=2}^L w_{s_{l-1},s_l})$$ + + where $Z$ is a normalization value so that the sum of $P(s)$ over + all possible sequences is 1, and $x$ is the emission feature weight + to the linear chain CRF. + + This operator implements the Forward-Backward algorithm for the linear chain + CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and + http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details. + + NOTE: + + 1. The feature function for a CRF is made up of the emission features and the + transition features. 
The emission feature weights are NOT computed in + this operator. They MUST be computed first before this operator is called. + + 2. Because this operator performs global normalization over all possible + sequences internally, it expects UNSCALED emission feature weights. + Please do not call this op with the emission feature being output of any + nonlinear activation. + + 3. The 2nd dimension of input(emission) MUST be equal to the tag number. + + Parameters: + size (int): The number of tags. + param_attr (ParamAttr, optional): The attribute of the learnable parameter for + transition. Default: None + dtype (str, optional): Data type, it can be 'float32' or 'float64'. + Default: `float32` + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import LinearChainCRF + + # emission: [batch_size, sequence_length, num_tags] + emission = paddle.rand((2, 8, 5)) + # label: [batch_size, sequence_length, num_tags] + # dummy label just for example usage + label = fluid.layers.ones((2, 8, 5), dtype='int64') + crf = LinearChainCRF(size=5) + cost = crf(emission, label) # [2, 1] + """ + + def __init__(self, size, param_attr=None, dtype='float32'): + super(LinearChainCRF, self).__init__() self._param_attr = param_attr self._dtype = dtype self._size = size - self._is_test = is_test self._transition = self.create_parameter( attr=self._param_attr, shape=[self._size + 2, self._size], @@ -4046,14 +4163,46 @@ def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): @property def weight(self): + """ + getter for transition matrix parameter + + Returns: + Parameter: The learnable transition parameter shaped `[size + 2, size]` \ + (`size` is for the number of tags). The data type should be float32 \ + or float64. + """ return self._transition @weight.setter def weight(self, value): + """ + setter for transition matrix parameter + + Parameters: + value (Parameter): The learnable transition parameter shaped `[size + 2, size]` \ + (`size` is for the number of tags). The data type should be float32 \ + or float64. + """ self._transition = value - def forward(self, input, label, length=None): + def forward(self, input, label, length): + """ + Computes the log-likelihood of tag sequences in a linear chain CRF. + + Parameters: + input (Variable): The input of unary potentials(emission). It is a + tensor with shape `[batch_size, sequence_length, num_tags]`. + The data type should be float32 or float64. + label (Variable): The golden sequence tags. It is a tensor + with shape `[batch_size, sequence_length]`. The data type + should be int64. + length (Variable): A tensor with shape `[batch_size]`. It stores real + length of each sequence for correctness. + Returns: + Variable: The negtive log-likelihood of tag sequences. It is a tensor \ + with shape `[batch_size, 1]` and has float32 or float64 data type. 
+ """ alpha = self._helper.create_variable_for_type_inference( dtype=self._dtype) emission_exps = self._helper.create_variable_for_type_inference( @@ -4077,18 +4226,58 @@ def forward(self, input, label, length=None): "EmissionExps": [emission_exps], "TransitionExps": transition_exps, "LogLikelihood": log_likelihood - }, - attrs={"is_test": self._is_test, }) + }) return log_likelihood class CRFDecoding(Layer): - def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): - super(CRFDecoding, self).__init__() + """ + CRFDecoding reads the emission feature weights and the transition + feature weights learned by the `LinearChainCRF` and performs decoding. + It implements the Viterbi algorithm which is a dynamic programming algorithm + for finding the most likely sequence of hidden states, called the Viterbi path, + that results in a sequence of observed tags. + + The output of this layer changes according to whether `label` is given: + + 1. `label` is given: + This happens in training. This operator is used to co-work with the chunk_eval + operator. When `label` is given, it returns tensor with the same shape as + `label` whose values are fixed to be 0, indicating an incorrect prediction, + or 1 indicating a tag is correctly predicted. Such an output is the input to + chunk_eval operator. + + 2. `label` is not given: + + This is the standard decoding process and get the highest scoring sequence + of tags. + + Parameters: + size (int): The number of tags. + param_attr (ParamAttr, optional): The attribute of the learnable parameter for + transition. Default: None + dtype (str, optional): Data type, it can be 'float32' or 'float64'. + Default: `float32` + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import CRFDecoding + + # emission: [batch_size, sequence_length, num_tags] + emission = paddle.rand((2, 8, 5)) + crf_decoding = CRFDecoding(size=5) + cost = crf_decoding(emission) # [2, 8] + """ + + def __init__(self, size, param_attr=None, dtype='float32'): + super(CRFDecoding, self).__init__() self._dtype = dtype self._size = size - self._is_test = is_test self._param_attr = param_attr self._transition = self.create_parameter( attr=self._param_attr, @@ -4097,13 +4286,49 @@ def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): @property def weight(self): + """ + getter for transition matrix parameter + + Returns: + Parameter: The learnable transition parameter shaped `[size + 2, size]` \ + (`size` is for the number of tags). The data type should be float32 \ + or float64. + """ return self._transition @weight.setter def weight(self, value): + """ + setter for transition matrix parameter + + Parameters: + value (Parameter): The learnable transition parameter shaped `[size + 2, size]` \ + (`size` is for the number of tags). The data type should be float32 \ + or float64. + """ self._transition = value - def forward(self, input, label=None, length=None): + def forward(self, input, length, label=None): + """ + Performs sequence tagging prediction. + + Parameters: + input (Variable): The input of unary potentials(emission). It is a + tensor with shape `[batch_size, sequence_length, num_tags]`. + The data type should be float32 or float64. + length (Variable): A tensor with shape `[batch_size]`. + It stores real length of each sequence for correctness. + label (Variable, optional): The golden sequence tags. It is a tensor + with shape `[batch_size, sequence_length]`. 
The data type + should be int64. Default None. + + Returns: + Variable: A tensor with shape `[batch_size, sequence_length]` and \ + int64 data type. If `label` is None, the tensor has binary values \ + indicating a correct or incorrect prediction. Otherwise its values \ + range from 0 to maximum tag number - 1, each element indicates \ + an index of a predicted tag. + """ viterbi_path = self._helper.create_variable_for_type_inference( dtype=self._dtype) @@ -4117,12 +4342,15 @@ def forward(self, input, label=None, length=None): self._helper.append_op( type='crf_decoding', inputs=this_inputs, - outputs={"ViterbiPath": [viterbi_path]}, - attrs={"is_test": self._is_test, }) + outputs={"ViterbiPath": [viterbi_path]}) return viterbi_path class GRUEncoder(Layer): + """ + A multi-layer bidirectional GRU encoder used by SequenceTagging. + """ + def __init__(self, input_dim, grnn_hidden_dim, @@ -4179,6 +4407,43 @@ def forward(self, input_feature, h0=None): class SequenceTagging(Layer): + """ + Sequence tagging model using multi-layer bidirectional GRU as backbone and + linear chain CRF as output layer. + + Parameters: + vocab_size (int): The size of vocabulary. + num_labels (int): The number of labels. + word_emb_dim (int, optional): The embedding size. Defalut 128 + grnn_hidden_dim (int, optional): The hidden size of GRU. Defalut 128 + emb_learning_rate (int, optional): The partial learning rate for embedding. + The actual learning rate for embedding would multiply it with the global + learning rate. Default 0.1 + crf_learning_rate (int, optional): The partial learning rate for crf. The + actual learning rate for embedding would multiply it with the global + learning rate. Default 0.1 + bigru_num (int, optional): The number of bidirectional GRU layers. + Default 2 + init_bound (float, optional): The range for uniform initializer would + be `(-init_bound, init_bound)`. It would be used for all parameters + except CRF transition matrix. Default 0.1 + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import SequenceTagging + + # word: [batch_size, sequence_length] + word = fluid.layers.ones([2, 8]) # dummy input just for example + length = fluid.layers.assign(np.array([6, 8]).astype('int64')) + seq_tagger = SequenceTagging(vocab_size=100, num_labels=5) + outputs = seq_tagger(word, length) + """ + def __init__(self, vocab_size, num_labels, @@ -4189,15 +4454,6 @@ def __init__(self, bigru_num=2, init_bound=0.1): super(SequenceTagging, self).__init__() - """ - define the sequence tagging network structure - word: stores the input of the model - for_infer: a boolean value, indicating if the model to be created is for training or predicting. - - return: - for infer: return the prediction - otherwise: return the prediction - """ self.word_emb_dim = word_emb_dim self.vocab_size = vocab_size self.num_labels = num_labels @@ -4244,7 +4500,27 @@ def __init__(self, def forward(self, word, lengths, target=None): """ - Configure the network + Performs sequence tagging. If `target` is None, it is for training and + loss would be returned, otherwise it is for inference and returns the + predicted tags. + + Parameters: + word (Variable): The input sequences to be labeled. It is a tensor + with shape `[batch_size, sequence_length]`. The data type should + be int64. + lengths (Variable): A tensor with shape `[batch_size]`. It stores real + length of each sequence. + target (Variable, optional): The golden sequence tags. 
It is a tensor + with shape `[batch_size, sequence_length]`. The data type + should be int64. It could be None for inference. Default None. + + Returns: + tuple: A tuple( :code:`(crf_decode, avg_cost, lengths)` ) If input \ + argument `target` is provided, including the most likely sequence \ + tags, the averaged CRF cost and the sequence lengths, the shapes \ + are `[batch_size, sequence_length]`, `[1]` and `[batch_size]`, \ + and the data types are int64, float32 and int64. Otherwise A \ + tuple( :code:`(crf_decode, lengths)` ) for inference. """ word_embed = self.word_embedding(word) input_feature = word_embed From 6e96261885e117d3b38fc11a2f43087c48975009 Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 12 May 2020 22:57:34 +0800 Subject: [PATCH 10/16] Add unit test for StackedRNNCell. --- hapi/tests/test_text.py | 34 ++++++++++++++++++++++++++++++++++ hapi/text/text.py | 3 +++ 2 files changed, 37 insertions(+) diff --git a/hapi/tests/test_text.py b/hapi/tests/test_text.py index 977656c..9e5d8b0 100644 --- a/hapi/tests/test_text.py +++ b/hapi/tests/test_text.py @@ -567,6 +567,40 @@ def make_inputs(self): return inputs +class TestStackedRNN(ModuleApiTest): + def setUp(self): + shape = (2, 4, 16) + self.inputs = [np.random.random(shape).astype("float32")] + self.outputs = None + self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2} + self.param_states = {} + + @staticmethod + def model_init(self, input_size, hidden_size, num_layers): + cells = [ + BasicLSTMCell(input_size, hidden_size), + BasicLSTMCell(hidden_size, hidden_size) + ] + stacked_cell = StackedRNNCell(cells) + self.lstm = RNN(stacked_cell) + + @staticmethod + def model_forward(self, inputs): + return self.lstm(inputs)[0] + + def make_inputs(self): + inputs = [ + Input( + [None, None, self.inputs[-1].shape[-1]], + "float32", + name="input"), + ] + return inputs + + def test_check_output(self): + self.check_output() + + class TestLSTM(ModuleApiTest): def setUp(self): shape = (2, 4, 16) diff --git a/hapi/text/text.py b/hapi/text/text.py index 860d3e7..b9569bd 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -49,6 +49,8 @@ 'BasicLSTMCell', 'BasicGRUCell', 'RNN', + 'BidirectionalRNN', + 'StackedRNNCell', 'StackedLSTMCell', 'LSTM', 'BidirectionalLSTM', @@ -1025,6 +1027,7 @@ class StackedRNNCell(RNNCell): """ def __init__(self, cells): + super(StackedRNNCell, self).__init__() self.cells = [] for i, cell in enumerate(cells): self.cells.append(self.add_sublayer("cell_%d" % i, cell)) From 56e2729c2045b35aefb9ce3cb627ee95b9e1a808 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 13 May 2020 01:28:12 +0800 Subject: [PATCH 11/16] Remove hapi.text apis' reuse parameter args for coverage. 
test=develop --- examples/sentiment_classification/models.py | 5 +- hapi/tests/test_text.py | 41 - hapi/text/__init__.py | 3 - hapi/text/text.py | 902 ++++---------------- 4 files changed, 148 insertions(+), 803 deletions(-) diff --git a/examples/sentiment_classification/models.py b/examples/sentiment_classification/models.py index 1816ba4..6332888 100644 --- a/examples/sentiment_classification/models.py +++ b/examples/sentiment_classification/models.py @@ -16,8 +16,9 @@ from paddle.fluid.dygraph.base import to_variable import numpy as np from hapi.model import Model -from hapi.text.text import GRUEncoder as BiGRUEncoder -from hapi.text.test import BOWEncoder, CNNEncoder, GRUEncoder +from hapi.text.text import _GRUEncoder as GRUEncoder +from hapi.text.text import _GRUEncoder as BiGRUEncoder +from hapi.text.test import BOWEncoder, CNNEncoder class CNN(Model): diff --git a/hapi/tests/test_text.py b/hapi/tests/test_text.py index 9e5d8b0..bdd7459 100644 --- a/hapi/tests/test_text.py +++ b/hapi/tests/test_text.py @@ -28,47 +28,6 @@ from hapi.text.text import * -def sigmoid(x): - return 1. / (1. + np.exp(-x)) - - -def tanh(x): - return 2. * sigmoid(2. * x) - 1. - - -def lstm_step(step_in, pre_hidden, pre_cell, gate_w, gate_b, forget_bias=1.0): - concat_1 = np.concatenate([step_in, pre_hidden], 1) - - gate_input = np.matmul(concat_1, gate_w) - gate_input += gate_b - i, j, f, o = np.split(gate_input, indices_or_sections=4, axis=1) - - new_cell = pre_cell * sigmoid(f + forget_bias) + sigmoid(i) * tanh(j) - new_hidden = tanh(new_cell) * sigmoid(o) - - return new_hidden, new_cell - - -def gru_step(step_in, pre_hidden, gate_w, gate_b, candidate_w, candidate_b): - concat_1 = np.concatenate([step_in, pre_hidden], 1) - - gate_input = np.matmul(concat_1, gate_w) - gate_input += gate_b - gate_input = sigmoid(gate_input) - r, u = np.split(gate_input, indices_or_sections=2, axis=1) - - r_hidden = r * pre_hidden - - candidate = np.matmul(np.concatenate([step_in, r_hidden], 1), candidate_w) - - candidate += candidate_b - c = tanh(candidate) - - new_hidden = u * pre_hidden + (1 - u) * c - - return new_hidden - - class ModuleApiTest(unittest.TestCase): @classmethod def setUpClass(cls): diff --git a/hapi/text/__init__.py b/hapi/text/__init__.py index 80568e3..2cefb4f 100644 --- a/hapi/text/__init__.py +++ b/hapi/text/__init__.py @@ -37,9 +37,6 @@ from hapi.text.text import TransformerCell as TransformerCell from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearchDecoder -from hapi.text.text import GRUCell as GRUCell -from hapi.text.text import GRUEncoderCell as GRUEncoderCell -from hapi.text.text import BiGRU as BiGRU from hapi.text.text import LinearChainCRF as LinearChainCRF from hapi.text.text import CRFDecoding as CRFDecoding from hapi.text.text import SequenceTagging as SequenceTagging diff --git a/hapi/text/text.py b/hapi/text/text.py index b9569bd..1fea41e 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -16,33 +16,22 @@ from __future__ import division from __future__ import print_function -import os +import copy +import collections import six import sys -if six.PY2: - reload(sys) - sys.setdefaultencoding('utf8') +from functools import partial, reduce -import ast -import time -import argparse as argparse import numpy as np -import multiprocessing - -import collections -import copy -from functools import partial, reduce import paddle import paddle.fluid as fluid import paddle.fluid.layers.utils as utils -from paddle.fluid.layers.utils import map_structure, flatten, 
pack_sequence_as -from paddle.fluid.dygraph import Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D -from paddle.fluid.data_feeder import convert_dtype - from paddle.fluid import layers -from paddle.fluid.dygraph import Layer from paddle.fluid.layers import BeamSearchDecoder +from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as +from paddle.fluid.dygraph import Layer, Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D +from paddle.fluid.data_feeder import convert_dtype __all__ = [ 'RNNCell', @@ -72,7 +61,6 @@ 'LinearChainCRF', 'CRFDecoding', 'SequenceTagging', - 'GRUEncoder', ] @@ -234,25 +222,6 @@ class BasicLSTMCell(RNNCell): forget_bias(float, optional): forget bias used when computing forget gate. Default 1.0 dtype(string, optional): The data type used in this cell. Default float32. - forget_gate_weights (dict, optional): A dict includes `w`, `h` and `b` - as keys, and the corresponding values should be instances of Parameter - which represent :math:`W_{x_{f}}, W_{h_{f}}, b_{f}` and have shape - [input_size, hidden_size], [hidden_size, hidden_size], [hidden_size] - separately. It is used for reusing and sharing weights when provided, - otherwise create these parameters. Note that parameters from input - gate, forget gate and cell would be concatenated in implementation. - input_gate_weights (dict, optional): A dict includes `w`, `h` and `b` as keys, - and the corresponding values should be instances of Parameter which - represent :math:`W_{x_{i}}, W_{h_{i}}, b_{i}` separately. It has the - same usage as :attr:`forget_gate_weights`. - output_gate_weights (dict, optional): A dict includes `w`, `h` and `b` as keys, - and the corresponding values should be instances of Parameter which - represent :math:`W_{x_{o}}, W_{h_{o}}, b_{o}` separately. It has the - same usage as :attr:`forget_gate_weights`. - cell_weights (dict, optional): A dict includes `w`, `h` and `b` as keys, - and the corresponding values should be instances of Parameter which - represent :math:`W_{x_{c}}, W_{h_{c}}, b_{c}` separately. It has the - same usage as :attr:`forget_gate_weights`. 
""" def __init__(self, @@ -263,19 +232,7 @@ def __init__(self, gate_activation=None, activation=None, forget_bias=1.0, - dtype='float32', - forget_gate_weights={"w": None, - "h": None, - "b": None}, - input_gate_weights={"w": None, - "h": None, - "b": None}, - output_gate_weights={"w": None, - "h": None, - "b": None}, - cell_weights={"w": None, - "h": None, - "b": None}): + dtype='float32'): super(BasicLSTMCell, self).__init__() self._hidden_size = hidden_size @@ -290,225 +247,43 @@ def __init__(self, self._dtype = dtype self._input_size = input_size - self.use_customized_weight = False - for _weights in [ - forget_gate_weights, input_gate_weights, output_gate_weights, - cell_weights - ]: - for _key in _weights: - if _weights[_key] is not None: - self.use_customized_weight = True - break - if self.use_customized_weight: - break - - if not self.use_customized_weight: - - self._weight = self.create_parameter( - attr=self._param_attr, - shape=[ - self._input_size + self._hidden_size, 4 * self._hidden_size - ], - dtype=self._dtype) - - self._bias = self.create_parameter( - attr=self._bias_attr, - shape=[4 * self._hidden_size], - dtype=self._dtype, - is_bias=True) - else: - if "w" in forget_gate_weights and forget_gate_weights[ - "w"] is not None: - self.fg_w = forget_gate_weights["w"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_forget_gate_w" - else: - tmp_param_attr = self._param_attr - self.fg_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in forget_gate_weights and forget_gate_weights[ - "h"] is not None: - self.fg_h = forget_gate_weights["h"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_forget_gate_h" - else: - tmp_param_attr = self._param_attr - self.fg_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in forget_gate_weights and forget_gate_weights[ - "b"] is not None: - self.fg_b = forget_gate_weights["b"] - else: - if self._bias_attr is not None and self._bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._bias_attr) - tmp_param_attr.name += "_forget_gate_b" - else: - tmp_param_attr = self._bias_attr - self.fg_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - if "w" in input_gate_weights and input_gate_weights[ - "w"] is not None: - self.ig_w = input_gate_weights["w"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_input_gate_w" - else: - tmp_param_attr = self._param_attr - - self.ig_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in input_gate_weights and input_gate_weights[ - "h"] is not None: - self.ig_h = input_gate_weights["h"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_input_gate_h" - else: - tmp_param_attr = self._param_attr - - self.ig_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in input_gate_weights and input_gate_weights[ - 
"b"] is not None: - self.ig_b = input_gate_weights["b"] - else: - if self._bias_attr is not None and self._bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._bias_attr) - tmp_param_attr.name += "_input_gate_b" - else: - tmp_param_attr = self._bias_attr - self.ig_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - if "w" in output_gate_weights and output_gate_weights[ - "w"] is not None: - self.og_w = output_gate_weights["w"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_output_gate_w" - else: - tmp_param_attr = self._param_attr - self.og_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in output_gate_weights and output_gate_weights[ - "h"] is not None: - self.og_h = output_gate_weights["h"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_output_gate_h" - else: - tmp_param_attr = self._param_attr - - self.og_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in output_gate_weights and output_gate_weights[ - "b"] is not None: - self.og_b = output_gate_weights["b"] - else: - if self._bias_attr is not None and self._bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._bias_attr) - tmp_param_attr.name += "_output_gate_b" - else: - tmp_param_attr = self._bias_attr - self.og_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - if "w" in cell_weights and cell_weights["w"] is not None: - self.c_w = cell_weights["w"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_cell_w" - else: - tmp_param_attr = self._param_attr - - self.c_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in cell_weights and cell_weights["h"] is not None: - self.c_h = cell_weights["h"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_cell_h" - else: - tmp_param_attr = self._param_attr - self.c_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in cell_weights and cell_weights["b"] is not None: - self.c_b = cell_weights["b"] - else: - if self._bias_attr is not None and self._bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._bias_attr) - tmp_param_attr.name += "_cell_b" - else: - tmp_param_attr = self._bias_attr - self.c_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - def forward(self, input, state): - - if self.use_customized_weight: - weight_w = fluid.layers.concat( - [self.ig_w, self.c_w, self.fg_w, self.og_w], axis=-1) - weight_h = fluid.layers.concat( - [self.ig_h, self.c_h, self.fg_h, self.og_h], axis=-1) - _weight = fluid.layers.concat([weight_w, weight_h], axis=0) - _bias = fluid.layers.concat( - [self.ig_b, self.c_b, self.fg_b, self.og_b]) - else: - _weight = self._weight - _bias = self._bias + self._weight = 
self.create_parameter( + attr=self._param_attr, + shape=[ + self._input_size + self._hidden_size, 4 * self._hidden_size + ], + dtype=self._dtype) - pre_hidden, pre_cell = state - concat_input_hidden = layers.concat([input, pre_hidden], 1) - gate_input = layers.matmul(x=concat_input_hidden, y=_weight) + self._bias = self.create_parameter( + attr=self._bias_attr, + shape=[4 * self._hidden_size], + dtype=self._dtype, + is_bias=True) - gate_input = layers.elementwise_add(gate_input, _bias) + def forward(self, inputs, states): + """ + Performs single step LSTM calculations. + + Parameters: + inputs (Variable): A tensor with shape `[batch_size, input_size]`, + corresponding to :math:`x_t` in the formula. The data type + should be float32 or float64. + states (Variable): A list of containing two tensors, each shaped + `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` + in the formula. The data type should be float32 or float64. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula; `new_states` is a list containing \ + two tenser variables shaped `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}, c_{t}` in the formula. The data type of these \ + tensors all is same as that of `states`. + """ + pre_hidden, pre_cell = states + concat_input_hidden = layers.concat([inputs, pre_hidden], 1) + gate_input = layers.matmul(x=concat_input_hidden, y=self._weight) + gate_input = layers.elementwise_add(gate_input, self._bias) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) new_cell = layers.elementwise_add( layers.elementwise_mul( @@ -564,21 +339,6 @@ class BasicGRUCell(RNNCell): GRU, that is :math:`act_c` in the formula. Default: None, representing for 'fluid.layers.tanh'. dtype(string, optional): The data type used in this cell. Default float32. - update_gate_weights (dict, optional): A dict includes `w`, `h` and `b` - as keys, and the corresponding values should be instances of Parameter - which represent :math:`W_{ux}, W_{uh}, b_{u}` and have shape - [input_size, hidden_size], [hidden_size, hidden_size], [hidden_size] - separately. It is used for reusing and sharing weights when provided, - otherwise create these parameters. Note that parameters from update - gate and reset gate would be concatenated in implementation. - reset_gate_weights (dict, optional): A dict includes `w`, `h` and `b` as keys, - and the corresponding values should be instances of Parameter which - represent :math:`W_{rx}, W_{rh}, b_{r}` separately. It has the - same usage as :attr:`update_gate_weights`. - cell_weights (dict, optional): A dict includes `w`, `h` and `b` as keys, - and the corresponding values should be instances of Parameter which - represent :math:`W_{cx}, W_{ch}, b_{c}`` separately. It has the - same usage as :attr:`update_gate_weights`. 
""" def __init__(self, @@ -588,16 +348,7 @@ def __init__(self, bias_attr=None, gate_activation=None, activation=None, - dtype='float32', - update_gate_weights={"w": None, - "h": None, - "b": None}, - reset_gate_weights={"w": None, - "h": None, - "b": None}, - cell_weights={"w": None, - "h": None, - "b": None}): + dtype='float32'): super(BasicGRUCell, self).__init__() self._input_size = input_size self._hidden_size = hidden_size @@ -607,20 +358,6 @@ def __init__(self, self._activation = activation or layers.tanh self._dtype = dtype - assert isinstance(update_gate_weights, dict) - assert isinstance(reset_gate_weights, dict) - assert isinstance(cell_weights, dict) - - self.use_customized_weight = False - for _weights in [ - update_gate_weights, reset_gate_weights, cell_weights - ]: - for _key in _weights: - if _weights[_key] is not None: - self.use_customized_weight = True - if self.use_customized_weight: - break - if self._param_attr is not None and self._param_attr.name is not None: gate_param_attr = copy.deepcopy(self._param_attr) candidate_param_attr = copy.deepcopy(self._param_attr) @@ -630,194 +367,62 @@ def __init__(self, gate_param_attr = self._param_attr candidate_param_attr = self._param_attr - if not self.use_customized_weight: - self._gate_weight = self.create_parameter( - attr=gate_param_attr, - shape=[ - self._input_size + self._hidden_size, 2 * self._hidden_size - ], - dtype=self._dtype) - - self._candidate_weight = self.create_parameter( - attr=candidate_param_attr, - shape=[ - self._input_size + self._hidden_size, self._hidden_size - ], - dtype=self._dtype) - - if self._bias_attr is not None and self._bias_attr.name is not None: - gate_bias_attr = copy.deepcopy(self._bias_attr) - candidate_bias_attr = copy.deepcopy(self._bias_attr) - gate_bias_attr.name += "_gate" - candidate_bias_attr.name += "_candidate" - else: - gate_bias_attr = self._bias_attr - candidate_bias_attr = self._bias_attr - - self._gate_bias = self.create_parameter( - attr=gate_bias_attr, - shape=[2 * self._hidden_size], - dtype=self._dtype, - is_bias=True) - self._candidate_bias = self.create_parameter( - attr=candidate_bias_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) + self._gate_weight = self.create_parameter( + attr=gate_param_attr, + shape=[ + self._input_size + self._hidden_size, 2 * self._hidden_size + ], + dtype=self._dtype) - else: + self._candidate_weight = self.create_parameter( + attr=candidate_param_attr, + shape=[self._input_size + self._hidden_size, self._hidden_size], + dtype=self._dtype) - # create the parameters of gates in gru - if "w" in update_gate_weights and update_gate_weights[ - "w"] is not None: - self.ug_w = update_gate_weights["w"] - else: - if gate_param_attr is not None and gate_param_attr.name is not None: - tmp_param_attr = copy.deepcopy(gate_param_attr) - tmp_param_attr.name += "_update_gate_w" - else: - tmp_param_attr = gate_param_attr - self.ug_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in update_gate_weights and update_gate_weights[ - "h"] is not None: - self.ug_h = update_gate_weights["h"] - else: - if gate_param_attr is not None and gate_param_attr.name is not None: - tmp_param_attr = copy.deepcopy(gate_param_attr) - tmp_param_attr.name += "_update_gate_h" - else: - tmp_param_attr = gate_param_attr - self.ug_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in update_gate_weights 
and update_gate_weights[ - "b"] is not None: - self.ug_b = update_gate_weights["b"] - else: - if gate_bias_attr is not None and gate_bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(gate_bias_attr) - tmp_param_attr.name += "_update_gate_b" - else: - tmp_param_attr = gate_bias_attr - self.ug_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - # reset gate parameters - if "w" in reset_gate_weights and reset_gate_weights[ - "w"] is not None: - self.rg_w = reset_gate_weights["w"] - else: - if gate_param_attr is not None and gate_param_attr.name is not None: - tmp_param_attr = copy.deepcopy(gate_param_attr) - tmp_param_attr.name += "_reset_gate_w" - else: - tmp_param_attr = gate_param_attr - self.rg_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in reset_gate_weights and reset_gate_weights[ - "h"] is not None: - self.rg_h = reset_gate_weights["h"] - else: - if gate_param_attr is not None and gate_param_attr.name is not None: - tmp_param_attr = copy.deepcopy(gate_param_attr) - tmp_param_attr.name += "_reset_gate_h" - else: - tmp_param_attr = gate_param_attr - self.rg_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in reset_gate_weights and reset_gate_weights[ - "b"] is not None: - self.rg_b = reset_gate_weights["b"] - else: - if gate_bias_attr is not None and gate_bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(gate_bias_attr) - tmp_param_attr.name += "_reset_gate_b" - else: - tmp_param_attr = gate_bias_attr - self.rg_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - # cell parameters - if "w" in cell_weights and cell_weights["w"] is not None: - self.c_w = cell_weights["w"] - else: - if candidate_param_attr is not None and candidate_param_attr.name is not None: - tmp_param_attr = copy.deepcopy(candidate_param_attr) - tmp_param_attr.name += "_cell_w" - else: - tmp_param_attr = gate_param_attr - - self.c_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in cell_weights and cell_weights["h"] is not None: - self.c_h = cell_weights["h"] - else: - if candidate_param_attr is not None and candidate_param_attr.name is not None: - tmp_param_attr = copy.deepcopy(candidate_param_attr) - tmp_param_attr.name += "_cell_h" - else: - tmp_param_attr = gate_param_attr - self.c_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in cell_weights and cell_weights["b"] is not None: - self.c_b = cell_weights["b"] - else: - if candidate_bias_attr is not None and candidate_bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(candidate_bias_attr) - tmp_param_attr.name += "_cell_b" - else: - tmp_param_attr = gate_bias_attr - self.c_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - def forward(self, input, state): - - if self.use_customized_weight: - rg_weights = layers.concat([self.rg_w, self.rg_h], axis=0) - ug_weights = layers.concat([self.ug_w, self.ug_h], axis=0) - _gate_weight = layers.concat([rg_weights, ug_weights], axis=-1) - _candidate_weight = layers.concat([self.c_w, self.c_h], axis=0) - _gate_bias = layers.concat([self.rg_b, self.ug_b], axis=0) - 
_candidate_bias = self.c_b + if self._bias_attr is not None and self._bias_attr.name is not None: + gate_bias_attr = copy.deepcopy(self._bias_attr) + candidate_bias_attr = copy.deepcopy(self._bias_attr) + gate_bias_attr.name += "_gate" + candidate_bias_attr.name += "_candidate" else: - _gate_weight = self._gate_weight - _gate_bias = self._gate_bias - _candidate_weight = self._candidate_weight - _candidate_bias = self._candidate_bias + gate_bias_attr = self._bias_attr + candidate_bias_attr = self._bias_attr + + self._gate_bias = self.create_parameter( + attr=gate_bias_attr, + shape=[2 * self._hidden_size], + dtype=self._dtype, + is_bias=True) + self._candidate_bias = self.create_parameter( + attr=candidate_bias_attr, + shape=[self._hidden_size], + dtype=self._dtype, + is_bias=True) - pre_hidden = state - concat_input_hidden = layers.concat([input, pre_hidden], axis=1) + def forward(self, inputs, states): + """ + Performs single step GRU calculations. - gate_input = layers.matmul(x=concat_input_hidden, y=_gate_weight) + Parameters: + inputs (Variable): A tensor with shape `[batch_size, input_size]`, + corresponding to :math:`x_t` in the formula. The data type + should be float32 or float64. + states (Variable): A tensor with shape `[batch_size, hidden_size]`. + corresponding to :math:`h_{t-1}` in the formula. The data type + should be float32 or float64. - gate_input = layers.elementwise_add(gate_input, _gate_bias) + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \ + `new_states` is the same tensor shaped `[batch_size, hidden_size]`, \ + corresponding to :math:`h_t` in the formula. The data type of the \ + tensor is same as that of `states`. + """ + pre_hidden = states + concat_input_hidden = layers.concat([inputs, pre_hidden], axis=1) + + gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight) + + gate_input = layers.elementwise_add(gate_input, self._gate_bias) gate_input = self._gate_activation(gate_input) r, u = layers.split(gate_input, num_or_sections=2, dim=1) @@ -825,8 +430,8 @@ def forward(self, input, state): r_hidden = r * pre_hidden candidate = layers.matmul( - layers.concat([input, r_hidden], 1), _candidate_weight) - candidate = layers.elementwise_add(candidate, _candidate_bias) + layers.concat([inputs, r_hidden], 1), self._candidate_weight) + candidate = layers.elementwise_add(candidate, self._candidate_bias) c = self._activation(candidate) new_hidden = u * pre_hidden + (1 - u) * c @@ -2650,6 +2255,7 @@ class TransformerCell(Layer): class Embedder(fluid.dygraph.Layer): def __init__(self): + super(Embedder, self).__init__() self.word_embedder = Embedding(size=[1000, 128]) self.pos_embedder = Embedding(size=[500, 128]) @@ -2999,11 +2605,7 @@ class PrePostProcessLayer(Layer): out = process(x) # [2, 4, 32] """ - def __init__(self, - process_cmd, - d_model, - dropout_rate=0.1, - reused_layer_norm=None): + def __init__(self, process_cmd, d_model, dropout_rate=0.1): super(PrePostProcessLayer, self).__init__() self.process_cmd = process_cmd self.functors = [] @@ -3012,15 +2614,12 @@ def __init__(self, self.functors.append( lambda x, y: x + y if y is not None else x) elif cmd == "n": # add layer normalization - if reused_layer_norm is not None: - layer_norm = reused_layer_norm - else: - layer_norm = LayerNorm( - normalized_shape=d_model, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.))) + layer_norm = LayerNorm( + normalized_shape=d_model, 
+ param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.))) self.functors.append( self.add_sublayer( @@ -3091,16 +2690,7 @@ class MultiHeadAttention(Layer): output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] """ - def __init__(self, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0.0, - reused_query_fc=None, - reused_key_fc=None, - reused_value_fc=None, - reused_proj_fc=None): + def __init__(self, d_key, d_value, d_model, n_head, dropout_rate=0.1): super(MultiHeadAttention, self).__init__() self.n_head = n_head @@ -3109,30 +2699,14 @@ def __init__(self, self.d_model = d_model self.dropout_rate = dropout_rate - if reused_query_fc is not None: - self.q_fc = reused_query_fc - else: - self.q_fc = Linear( - input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) - if reused_key_fc is not None: - self.k_fc = reused_key_fc - else: - self.k_fc = Linear( - input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) - if reused_value_fc is not None: - self.v_fc = reused_value_fc - else: - self.v_fc = Linear( - input_dim=d_model, - output_dim=d_value * n_head, - bias_attr=False) - if reused_proj_fc is not None: - self.proj_fc = reused_proj_fc - else: - self.proj_fc = Linear( - input_dim=d_value * n_head, - output_dim=d_model, - bias_attr=False) + self.q_fc = Linear( + input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) + self.k_fc = Linear( + input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) + self.v_fc = Linear( + input_dim=d_model, output_dim=d_value * n_head, bias_attr=False) + self.proj_fc = Linear( + input_dim=d_value * n_head, output_dim=d_model, bias_attr=False) def _prepare_qkv(self, queries, keys, values, cache=None): """ @@ -3322,24 +2896,12 @@ class FFN(Layer): out = ffn(x) # [2, 4, 32] """ - def __init__(self, - d_inner_hid, - d_model, - dropout_rate=0.1, - fc1_act="relu", - reused_fc1=None, - reused_fc2=None): + def __init__(self, d_inner_hid, d_model, dropout_rate=0.1, fc1_act="relu"): super(FFN, self).__init__() self.dropout_rate = dropout_rate - if reused_fc1 is not None: - self.fc1 = reused_fc1 - else: - self.fc1 = Linear( - input_dim=d_model, output_dim=d_inner_hid, act=fc1_act) - if reused_fc2 is not None: - self.fc2 = reused_fc2 - else: - self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) + self.fc1 = Linear( + input_dim=d_model, output_dim=d_inner_hid, act=fc1_act) + self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) def forward(self, x): """ @@ -3422,51 +2984,22 @@ def __init__(self, relu_dropout=0.1, preprocess_cmd="n", postprocess_cmd="da", - ffn_fc1_act="relu", - reused_pre_selatt_layernorm=None, - reused_multihead_att_weights={ - "reused_query_fc": None, - "reused_key_fc": None, - "reused_value_fc": None, - "reused_proj_fc": None - }, - reused_post_selfatt_layernorm=None, - reused_pre_ffn_layernorm=None, - reused_ffn_weights={"reused_fc1": None, - "reused_fc2": None}, - reused_post_ffn_layernorm=None): + ffn_fc1_act="relu"): super(TransformerEncoderLayer, self).__init__() self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_selatt_layernorm) - self.self_attn = MultiHeadAttention( - d_key, - d_value, - d_model, - n_head, - attention_dropout, - reused_query_fc=reused_multihead_att_weights["reused_query_fc"], - reused_key_fc=reused_multihead_att_weights["reused_key_fc"], - reused_value_fc=reused_multihead_att_weights["reused_value_fc"], - 
reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"]) - self.postprocesser1 = PrePostProcessLayer( - postprocess_cmd, d_model, prepostprocess_dropout, - reused_post_selfatt_layernorm) + prepostprocess_dropout) + self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_ffn_layernorm) - self.ffn = FFN(d_inner_hid, - d_model, - relu_dropout, - fc1_act=ffn_fc1_act, - reused_fc1=reused_ffn_weights["reused_fc1"], - reused_fc2=reused_ffn_weights["reused_fc2"]) + prepostprocess_dropout) + self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act) self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout, - reused_post_ffn_layernorm) + prepostprocess_dropout) def forward(self, enc_input, attn_bias=None): """ @@ -3667,83 +3200,33 @@ def __init__(self, d_value, d_model, d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, preprocess_cmd="n", postprocess_cmd="da", - reused_pre_selfatt_layernorm=None, - reused_self_multihead_att_weights={ - "reused_query_fc": None, - "reused_key_fc": None, - "reused_value_fc": None, - "reused_proj_fc": None - }, - reused_post_selfatt_layernorm=None, - reused_pre_crossatt_layernorm=None, - reused_cross_multihead_att_weights={ - "reused_query_fc": None, - "reused_key_fc": None, - "reused_value_fc": None, - "reused_proj_fc": None - }, - reused_post_crossatt_layernorm=None, - reused_pre_ffn_layernorm=None, - reused_ffn_weights={"reused_fc1": None, - "reused_fc2": None}, - reused_post_ffn_layernorm=None): + ffn_fc1_act="relu"): super(TransformerDecoderLayer, self).__init__() self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_selfatt_layernorm) - self.self_attn = MultiHeadAttention( - d_key, - d_value, - d_model, - n_head, - attention_dropout, - reused_query_fc=reused_self_multihead_att_weights[ - "reused_query_fc"], - reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"], - reused_value_fc=reused_self_multihead_att_weights[ - "reused_value_fc"], - reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"]) - self.postprocesser1 = PrePostProcessLayer( - postprocess_cmd, d_model, prepostprocess_dropout, - reused_post_selfatt_layernorm) + prepostprocess_dropout) + self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_crossatt_layernorm) - self.cross_attn = MultiHeadAttention( - d_key, - d_value, - d_model, - n_head, - attention_dropout, - reused_query_fc=reused_cross_multihead_att_weights[ - "reused_query_fc"], - reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"], - reused_value_fc=reused_cross_multihead_att_weights[ - "reused_value_fc"], - reused_proj_fc=reused_cross_multihead_att_weights[ - "reused_proj_fc"]) - self.postprocesser2 = PrePostProcessLayer( - postprocess_cmd, d_model, prepostprocess_dropout, - reused_post_crossatt_layernorm) + prepostprocess_dropout) + self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + 
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_ffn_layernorm) - self.ffn = FFN(d_inner_hid, - d_model, - relu_dropout, - reused_fc1=reused_ffn_weights["reused_fc1"], - reused_fc2=reused_ffn_weights["reused_fc2"]) + prepostprocess_dropout) + self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act) self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout, - reused_post_ffn_layernorm) + prepostprocess_dropout) def forward(self, dec_input, @@ -3991,101 +3474,6 @@ def prepare_incremental_cache(self, enc_output): } for i in range(self.n_layer)] -#TODO: we should merge GRUCell with BasicGRUCell -class GRUCell(RNNCell): - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation='sigmoid', - candidate_activation='tanh', - origin_mode=False): - super(GRUCell, self).__init__() - self.hidden_size = hidden_size - self.fc_layer = Linear( - input_size, hidden_size * 3, param_attr=param_attr) - - self.gru_unit = GRUUnit( - hidden_size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode) - - def forward(self, inputs, states): - # for GRUCell, `step_outputs` and `new_states` both are hidden - x = self.fc_layer(inputs) - hidden, _, _ = self.gru_unit(x, states) - return hidden, hidden - - @property - def state_shape(self): - return [self.hidden_size] - - -#TODO: we should merge GRUCell with BasicGRUCell -class GRUEncoderCell(RNNCell): - def __init__(self, - num_layers, - input_size, - hidden_size, - dropout_prob=0., - init_scale=0.1): - super(GRUEncoderCell, self).__init__() - self.dropout_prob = dropout_prob - # use add_sublayer to add multi-layers - self.gru_cells = [] - for i in range(num_layers): - self.gru_cells.append( - self.add_sublayer( - "gru_%d" % i, - #BasicGRUCell( - GRUCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale))))) - - def forward(self, step_input, states): - new_states = [] - for i, gru_cell in enumerate(self.gru_cells): - out, state = gru_cell(step_input, states[i]) - step_input = layers.dropout( - out, - self.dropout_prob, - dropout_implementation='upscale_in_train' - ) if self.dropout_prob > 0 else out - new_states.append(step_input) - return step_input, new_states - - @property - def state_shape(self): - return [cell.state_shape for cell in self.gru_cells] - - -class BiGRU(fluid.dygraph.Layer): - def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): - super(BiGRU, self).__init__() - self.gru = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0, - init_bound), - is_reverse=False, - time_major=False) - - self.gru_r = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0, - init_bound), - is_reverse=True, - time_major=False) - - def forward(self, input_feature): - pre_gru, pre_state = self.gru(input_feature) - gru_r, r_state = self.gru_r(input_feature) - bi_merge = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1) - return bi_merge - - class LinearChainCRF(Layer): """ Computes the negtive log-likelihood of tag sequences in a linear chain CRF. 
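The hunk above ends on the LinearChainCRF class, which scores tag sequences under a linear-chain CRF and is exercised later in this patch. For readers who want to see what that negative log-likelihood actually is, here is a minimal framework-free NumPy sketch. It uses a generic parameterization (separate start/stop score vectors plus a [num_tags, num_tags] transition matrix); Paddle's underlying op keeps a somewhat different transition-parameter layout, and every name below is illustrative rather than part of the patched API.

    import numpy as np


    def logsumexp(x, axis):
        m = np.max(x, axis=axis, keepdims=True)
        return np.squeeze(m, axis=axis) + np.log(np.sum(np.exp(x - m), axis=axis))


    def crf_neg_log_likelihood(emission, transition, start, stop, tags):
        """NLL of one tag sequence under a generic linear-chain CRF.

        emission:   [seq_len, num_tags] unary scores
        transition: [num_tags, num_tags], transition[i, j] scores tag i -> tag j
        start/stop: [num_tags] boundary scores
        tags:       [seq_len] gold tag ids
        """
        seq_len, num_tags = emission.shape

        # Score of the gold path: boundary + unary + pairwise terms.
        gold = start[tags[0]] + emission[0, tags[0]] + stop[tags[-1]]
        for t in range(1, seq_len):
            gold += transition[tags[t - 1], tags[t]] + emission[t, tags[t]]

        # Log-partition over all paths via the forward algorithm.
        alpha = start + emission[0]                                   # [num_tags]
        for t in range(1, seq_len):
            alpha = logsumexp(alpha[:, None] + transition, axis=0) + emission[t]
        log_z = logsumexp(alpha + stop, axis=0)

        return log_z - gold


    emission = np.random.rand(6, 5)
    transition = np.random.rand(5, 5)
    start, stop = np.random.rand(5), np.random.rand(5)
    tags = np.array([0, 2, 1, 1, 4, 3])
    print(crf_neg_log_likelihood(emission, transition, start, stop, tags))

CRFDecoding, which appears further down in this file, maximizes the same path score with Viterbi decoding instead of summing over all paths.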
@@ -4349,7 +3737,7 @@ def forward(self, input, length, label=None): return viterbi_path -class GRUEncoder(Layer): +class _GRUEncoder(Layer): """ A multi-layer bidirectional GRU encoder used by SequenceTagging. """ @@ -4360,7 +3748,7 @@ def __init__(self, init_bound, num_layers=1, is_bidirection=False): - super(GRUEncoder, self).__init__() + super(_GRUEncoder, self).__init__() self.num_layers = num_layers self.is_bidirection = is_bidirection self.gru_list = [] @@ -4475,7 +3863,7 @@ def __init__(self, initializer=fluid.initializer.Uniform( low=-self.init_bound, high=self.init_bound))) - self.gru_encoder = GRUEncoder( + self.gru_encoder = _GRUEncoder( input_dim=self.grnn_hidden_dim, grnn_hidden_dim=self.grnn_hidden_dim, init_bound=self.init_bound, From 1f63da126115dd4a2ecbf56981303018cac12ed8 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 13 May 2020 02:57:23 +0800 Subject: [PATCH 12/16] Fix TransformerCell and TransformerBeamSearchDecoder example codes. test=develop --- hapi/text/text.py | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index 1fea41e..a9e23aa 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -2226,7 +2226,7 @@ def forward(self, input): return out -class TransformerCell(Layer): +class TransformerCell(RNNCell): """ TransformerCell wraps a Transformer decoder producing logits from `inputs` composed by ids and position. @@ -2249,9 +2249,13 @@ class TransformerCell(Layer): import paddle import paddle.fluid as fluid - from paddle.fluid.dygraph import Embedding + from paddle.fluid.dygraph import Embedding, Linear + from paddle.incubate.hapi.text import TransformerDecoder from paddle.incubate.hapi.text import TransformerCell from paddle.incubate.hapi.text import TransformerBeamSearchDecoder + from paddle.incubate.hapi.text import DynamicDecode + + paddle.enable_dygraph() class Embedder(fluid.dygraph.Layer): def __init__(self): @@ -2259,8 +2263,7 @@ def __init__(self): self.word_embedder = Embedding(size=[1000, 128]) self.pos_embedder = Embedding(size=[500, 128]) - def forward(self, inputs): - word, position = inputs + def forward(self, word, position): return self.word_embedder(word) + self.pos_embedder(position) embedder = Embedder() @@ -2270,18 +2273,18 @@ def forward(self, inputs): dynamic_decoder = DynamicDecode( TransformerBeamSearchDecoder( transformer_cell, - bos_id=0, - eos_id=1, + start_token=0, + end_token=1, beam_size=4, var_dim_in_state=2), - max_step_num, + max_step_num=10, is_test=True) - enc_output = paddle.rand((2, 4, 64)) + enc_output = paddle.rand((2, 4, 128)) # cross attention bias: [batch_size, n_head, trg_len, src_len] trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) # inputs for beam search on Transformer - states = cell.get_initial_states(encoder_output) + caches = transformer_cell.get_initial_states(enc_output) enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( enc_output, beam_size=4) trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( @@ -2389,7 +2392,7 @@ def state_shape(self): return [{ "k": [self.decoder.n_head, 0, self.decoder.d_key], "v": [self.decoder.n_head, 0, self.decoder.d_value], - } for i in range(len(self.decoder.n_layer))] + } for i in range(self.decoder.n_layer)] class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): @@ -2413,17 +2416,21 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): import paddle import paddle.fluid as fluid - from paddle.fluid.dygraph import Embedding 
+ from paddle.fluid.dygraph import Embedding, Linear + from paddle.incubate.hapi.text import TransformerDecoder from paddle.incubate.hapi.text import TransformerCell from paddle.incubate.hapi.text import TransformerBeamSearchDecoder + from paddle.incubate.hapi.text import DynamicDecode + + paddle.enable_dygraph() class Embedder(fluid.dygraph.Layer): def __init__(self): + super(Embedder, self).__init__() self.word_embedder = Embedding(size=[1000, 128]) self.pos_embedder = Embedding(size=[500, 128]) - def forward(self, inputs): - word, position = inputs + def forward(self, word, position): return self.word_embedder(word) + self.pos_embedder(position) embedder = Embedder() @@ -2433,18 +2440,18 @@ def forward(self, inputs): dynamic_decoder = DynamicDecode( TransformerBeamSearchDecoder( transformer_cell, - bos_id=0, - eos_id=1, + start_token=0, + end_token=1, beam_size=4, var_dim_in_state=2), - max_step_num, + max_step_num=10, is_test=True) enc_output = paddle.rand((2, 4, 128)) # cross attention bias: [batch_size, n_head, trg_len, src_len] trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) # inputs for beam search on Transformer - states = cell.get_initial_states(encoder_output) + caches = transformer_cell.get_initial_states(enc_output) enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( enc_output, beam_size=4) trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( From a13654b9be3fcb4d774b3cf135ce1f3d5b0819e3 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 13 May 2020 03:58:49 +0800 Subject: [PATCH 13/16] Fix example codes in hapi.text. test=develop --- hapi/text/text.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index a9e23aa..2eef453 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -474,7 +474,7 @@ class RNN(Layer): inputs = paddle.rand((2, 4, 32)) cell = StackedLSTMCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell, inputs=inputs) + rnn = RNN(cell=cell) outputs, _ = rnn(inputs) # [2, 4, 64] """ @@ -771,7 +771,7 @@ class StackedLSTMCell(RNNCell): inputs = paddle.rand((2, 4, 32)) cell = StackedLSTMCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell, inputs=inputs) + rnn = RNN(cell=cell) outputs, _ = rnn(inputs) # [2, 4, 64] """ @@ -1001,7 +1001,7 @@ class BidirectionalRNN(Layer): .. 
code-block:: python import paddle - from paddle.incubate.hapi.text import BasicLSTMCell, StackedRNNCell + from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN inputs = paddle.rand((2, 4, 32)) cell_fw = StackedLSTMCell(32, 64) @@ -1362,11 +1362,11 @@ class StackedGRUCell(RNNCell): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import StackedLSTMCell, RNN + from paddle.incubate.hapi.text import StackedGRUCell, RNN inputs = paddle.rand((2, 4, 32)) cell = StackedGRUCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell, inputs=inputs) + rnn = RNN(cell=cell) outputs, _ = rnn(inputs) # [2, 4, 64] """ @@ -1502,7 +1502,7 @@ class GRU(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import LSTM + from paddle.incubate.hapi.text import GRU inputs = paddle.rand((2, 4, 32)) gru = GRU(input_size=32, hidden_size=64, num_layers=2) @@ -1625,7 +1625,7 @@ class BidirectionalGRU(Layer): from paddle.incubate.hapi.text import BidirectionalGRU inputs = paddle.rand((2, 4, 32)) - gru = BidirectionalGRU(input_size=32, hidden_size=64, num_layers=2) + bi_gru = BidirectionalGRU(input_size=32, hidden_size=64, num_layers=2) outputs, _ = bi_gru(inputs) # [2, 4, 128] """ @@ -1779,6 +1779,7 @@ class DynamicDecode(Layer): import paddle import paddle.fluid as fluid + from paddle.fluid.layers import BeamSearchDecoder from paddle.incubate.hapi.text import StackedLSTMCell, DynamicDecode vocab_size, d_model, = 100, 32 @@ -2693,7 +2694,7 @@ class MultiHeadAttention(Layer): query = paddle.rand((2, 4, 128)) # self attention bias: [batch_size, n_head, src_len, src_len] attn_bias = paddle.rand((2, 2, 4, 4)) - multi_head_attn = MultiHeadAttention(64, 64, 2, 128) + multi_head_attn = MultiHeadAttention(64, 64, 128, n_head=2) output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] """ @@ -2976,8 +2977,8 @@ class TransformerEncoderLayer(Layer): enc_input = paddle.rand((2, 4, 128)) # self attention bias: [batch_size, n_head, src_len, src_len] attn_bias = paddle.rand((2, 2, 4, 4)) - encoder_layer = TransformerEncoderLayer(2, 2, 64, 64, 128, 512) - enc_output = encoder_layer(inputs, attn_bias) # [2, 4, 128] + encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512) + enc_output = encoder_layer(enc_input, attn_bias) # [2, 4, 128] """ def __init__(self, @@ -3080,7 +3081,7 @@ class TransformerEncoder(Layer): # self attention bias: [batch_size, n_head, src_len, src_len] attn_bias = paddle.rand((2, 2, 4, 4)) encoder = TransformerEncoder(2, 2, 64, 64, 128, 512) - enc_output = encoder(inputs, attn_bias) # [2, 4, 128] + enc_output = encoder(enc_input, attn_bias) # [2, 4, 128] """ def __init__(self, @@ -3536,6 +3537,7 @@ class LinearChainCRF(Layer): .. 
code-block:: python + import numpy as np import paddle import paddle.fluid as fluid from paddle.incubate.hapi.text import LinearChainCRF @@ -3544,9 +3546,10 @@ class LinearChainCRF(Layer): emission = paddle.rand((2, 8, 5)) # label: [batch_size, sequence_length, num_tags] # dummy label just for example usage - label = fluid.layers.ones((2, 8, 5), dtype='int64') + label = paddle.ones((2, 8), dtype='int64') + length = fluid.layers.assign(np.array([6, 8]).astype('int64')) crf = LinearChainCRF(size=5) - cost = crf(emission, label) # [2, 1] + cost = crf(emission, label, length) # [2, 1] """ def __init__(self, size, param_attr=None, dtype='float32'): @@ -3667,9 +3670,10 @@ class CRFDecoding(Layer): from paddle.incubate.hapi.text import CRFDecoding # emission: [batch_size, sequence_length, num_tags] - emission = paddle.rand((2, 8, 5)) + emission = paddle.rand((2, 8, 5)) + length = fluid.layers.assign(np.array([6, 8]).astype('int64')) crf_decoding = CRFDecoding(size=5) - cost = crf_decoding(emission) # [2, 8] + cost = crf_decoding(emission, length) # [2, 8] """ def __init__(self, size, param_attr=None, dtype='float32'): @@ -3836,7 +3840,8 @@ class SequenceTagging(Layer): from paddle.incubate.hapi.text import SequenceTagging # word: [batch_size, sequence_length] - word = fluid.layers.ones([2, 8]) # dummy input just for example + # dummy input just for example + word = paddle.ones((2, 8), dtype='int64') length = fluid.layers.assign(np.array([6, 8]).astype('int64')) seq_tagger = SequenceTagging(vocab_size=100, num_labels=5) outputs = seq_tagger(word, length) From f1b6a68bf0e7f40e6f2d1ff5cf1da22c0910e3a0 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 13 May 2020 04:23:32 +0800 Subject: [PATCH 14/16] Add some apis in hapi.text into example code white list. test=develop --- hapi/text/text.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index 2eef453..97803cb 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -222,6 +222,19 @@ class BasicLSTMCell(RNNCell): forget_bias(float, optional): forget bias used when computing forget gate. Default 1.0 dtype(string, optional): The data type used in this cell. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import BasicLSTMCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = BasicLSTMCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] """ def __init__(self, @@ -339,6 +352,19 @@ class BasicGRUCell(RNNCell): GRU, that is :math:`act_c` in the formula. Default: None, representing for 'fluid.layers.tanh'. dtype(string, optional): The data type used in this cell. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import BasicGRUCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = BasicGRUCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] """ def __init__(self, @@ -1787,7 +1813,7 @@ class DynamicDecode(Layer): trg_embeder = fluid.dygraph.Embedding(size=[vocab_size, d_model]) output_layer = fluid.dygraph.Linear(d_model, vocab_size) cell = StackedLSTMCell(input_size=d_model, hidden_size=d_model) - decoder = BeamSearchDecoder(decoder_cell, + decoder = BeamSearchDecoder(cell, start_token=0, end_token=1, beam_size=4, @@ -3665,6 +3691,7 @@ class CRFDecoding(Layer): .. 
code-block:: python + import numpy as np import paddle import paddle.fluid as fluid from paddle.incubate.hapi.text import CRFDecoding From 503d40a7734b9a655d95e3548c5b18d0aeaf8403 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 13 May 2020 10:28:36 +0800 Subject: [PATCH 15/16] Fix example code of DynamicDecode in hapi.text. text=develop --- hapi/text/text.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hapi/text/text.py b/hapi/text/text.py index 97803cb..de392f5 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -1808,6 +1808,8 @@ class DynamicDecode(Layer): from paddle.fluid.layers import BeamSearchDecoder from paddle.incubate.hapi.text import StackedLSTMCell, DynamicDecode + paddle.enable_dygraph() + vocab_size, d_model, = 100, 32 encoder_output = paddle.rand((2, 4, d_model)) trg_embeder = fluid.dygraph.Embedding(size=[vocab_size, d_model]) From 837eff99c6df671b6ff3a70f2b86a4776af5e35f Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 13 May 2020 14:09:05 +0800 Subject: [PATCH 16/16] Rename Model.self as model in test_text.py test=develop --- hapi/tests/test_text.py | 126 ++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/hapi/tests/test_text.py b/hapi/tests/test_text.py index bdd7459..f74d631 100644 --- a/hapi/tests/test_text.py +++ b/hapi/tests/test_text.py @@ -56,13 +56,13 @@ def __impl__(self, *args, **kwargs): return __impl__ @staticmethod - def model_init(self, *args, **kwargs): + def model_init(model, *args, **kwargs): raise NotImplementedError( "model_init acts as `Model.__init__`, thus must implement it") @staticmethod - def model_forward(self, *args, **kwargs): - return self.module(*args, **kwargs) + def model_forward(model, *args, **kwargs): + return model.module(*args, **kwargs) def make_inputs(self): # TODO(guosheng): add default from `self.inputs` @@ -118,7 +118,7 @@ def check_output(self): class TestBasicLSTM(ModuleApiTest): def setUp(self): - # TODO(guosheng): Change to big size. Currentlys bigger hidden size for + # TODO(guosheng): Change to big size. Currently bigger hidden size for # LSTM would fail, the second static graph run might get diff output # with others. 
shape = (2, 4, 16) @@ -128,8 +128,8 @@ def setUp(self): self.param_states = {} @staticmethod - def model_init(self, input_size, hidden_size): - self.lstm = RNN( + def model_init(model, input_size, hidden_size): + model.lstm = RNN( BasicLSTMCell( input_size, hidden_size, @@ -137,8 +137,8 @@ def model_init(self, input_size, hidden_size): bias_attr=fluid.ParamAttr(name="lstm_bias"))) @staticmethod - def model_forward(self, inputs): - return self.lstm(inputs)[0] + def model_forward(model, inputs): + return model.lstm(inputs)[0] def make_inputs(self): inputs = [ @@ -162,12 +162,12 @@ def setUp(self): self.param_states = {} @staticmethod - def model_init(self, input_size, hidden_size): - self.gru = RNN(BasicGRUCell(input_size, hidden_size)) + def model_init(model, input_size, hidden_size): + model.gru = RNN(BasicGRUCell(input_size, hidden_size)) @staticmethod - def model_forward(self, inputs): - return self.gru(inputs)[0] + def model_forward(model, inputs): + return model.gru(inputs)[0] def make_inputs(self): inputs = [ @@ -220,8 +220,8 @@ def model_init(self, decoder, max_step_num=max_step_num, is_test=True) @staticmethod - def model_forward(self, init_hidden, init_cell): - return self.beam_search_decoder([init_hidden, init_cell])[0] + def model_forward(model, init_hidden, init_cell): + return model.beam_search_decoder([init_hidden, init_cell])[0] def make_inputs(self): inputs = [ @@ -258,7 +258,7 @@ def setUp(self): self.param_states = {} @staticmethod - def model_init(self, + def model_init(model, n_layer, n_head, d_key, @@ -271,14 +271,14 @@ def model_init(self, preprocess_cmd="n", postprocess_cmd="da", ffn_fc1_act="relu"): - self.encoder = TransformerEncoder( + model.encoder = TransformerEncoder( n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, ffn_fc1_act) @staticmethod - def model_forward(self, enc_input, attn_bias): - return self.encoder(enc_input, attn_bias) + def model_forward(model, enc_input, attn_bias): + return model.encoder(enc_input, attn_bias) def make_inputs(self): inputs = [ @@ -321,7 +321,7 @@ def setUp(self): self.param_states = {} @staticmethod - def model_init(self, + def model_init(model, n_layer, n_head, d_key, @@ -333,20 +333,20 @@ def model_init(self, relu_dropout=0.1, preprocess_cmd="n", postprocess_cmd="da"): - self.decoder = TransformerDecoder( + model.decoder = TransformerDecoder( n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd) @staticmethod - def model_forward(self, + def model_forward(model, dec_input, enc_output, self_attn_bias, cross_attn_bias, caches=None): - return self.decoder(dec_input, enc_output, self_attn_bias, - cross_attn_bias, caches) + return model.decoder(dec_input, enc_output, self_attn_bias, + cross_attn_bias, caches) def make_inputs(self): inputs = [ @@ -394,7 +394,7 @@ def setUp(self): self.param_states = {} @staticmethod - def model_init(self, + def model_init(model, vocab_size, n_layer, n_head, @@ -411,7 +411,7 @@ def model_init(self, eos_id=1, beam_size=4, max_step_num=20): - self.beam_size = beam_size + model.beam_size = beam_size def embeder_init(self, size): Layer.__init__(self) @@ -423,13 +423,13 @@ def embeder_init(self, size): }) embedder = Embedder(size=[vocab_size, d_model]) output_layer = Linear(d_model, vocab_size) - self.decoder = TransformerDecoder( + model.decoder = TransformerDecoder( n_layer, n_head, d_key, d_value, d_model, d_inner_hid, 
prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd) - transformer_cell = TransformerCell(self.decoder, embedder, + transformer_cell = TransformerCell(model.decoder, embedder, output_layer) - self.beam_search_decoder = DynamicDecode( + model.beam_search_decoder = DynamicDecode( TransformerBeamSearchDecoder( transformer_cell, bos_id, @@ -440,14 +440,14 @@ def embeder_init(self, size): is_test=True) @staticmethod - def model_forward(self, enc_output, trg_src_attn_bias): - caches = self.decoder.prepare_incremental_cache(enc_output) + def model_forward(model, enc_output, trg_src_attn_bias): + caches = model.decoder.prepare_incremental_cache(enc_output) enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - enc_output, self.beam_size) + enc_output, model.beam_size) trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, self.beam_size) - static_caches = self.decoder.prepare_static_cache(enc_output) - rs, _ = self.beam_search_decoder( + trg_src_attn_bias, model.beam_size) + static_caches = model.decoder.prepare_static_cache(enc_output) + rs, _ = model.beam_search_decoder( inits=caches, enc_output=enc_output, trg_src_attn_bias=trg_src_attn_bias, @@ -483,7 +483,7 @@ def setUp(self): self.param_states = {} @staticmethod - def model_init(self, + def model_init(model, vocab_size, num_labels, word_emb_dim=128, @@ -492,13 +492,13 @@ def model_init(self, crf_learning_rate=0.1, bigru_num=2, init_bound=0.1): - self.tagger = SequenceTagging(vocab_size, num_labels, word_emb_dim, - grnn_hidden_dim, emb_learning_rate, - crf_learning_rate, bigru_num, init_bound) + model.tagger = SequenceTagging( + vocab_size, num_labels, word_emb_dim, grnn_hidden_dim, + emb_learning_rate, crf_learning_rate, bigru_num, init_bound) @staticmethod - def model_forward(self, word, lengths, target=None): - return self.tagger(word, lengths, target) + def model_forward(model, word, lengths, target=None): + return model.tagger(word, lengths, target) def make_inputs(self): inputs = [ @@ -535,13 +535,13 @@ def setUp(self): self.param_states = {} @staticmethod - def model_init(self, input_size, hidden_size, num_layers): + def model_init(model, input_size, hidden_size, num_layers): cells = [ BasicLSTMCell(input_size, hidden_size), BasicLSTMCell(hidden_size, hidden_size) ] stacked_cell = StackedRNNCell(cells) - self.lstm = RNN(stacked_cell) + model.lstm = RNN(stacked_cell) @staticmethod def model_forward(self, inputs): @@ -569,12 +569,12 @@ def setUp(self): self.param_states = {} @staticmethod - def model_init(self, input_size, hidden_size, num_layers): - self.lstm = LSTM(input_size, hidden_size, num_layers=num_layers) + def model_init(model, input_size, hidden_size, num_layers): + model.lstm = LSTM(input_size, hidden_size, num_layers=num_layers) @staticmethod - def model_forward(self, inputs): - return self.lstm(inputs)[0] + def model_forward(model, inputs): + return model.lstm(inputs)[0] def make_inputs(self): inputs = [ @@ -598,13 +598,13 @@ def setUp(self): self.param_states = {} @staticmethod - def model_init(self, + def model_init(model, input_size, hidden_size, num_layers, merge_mode="concat", merge_each_layer=False): - self.bilstm = BidirectionalLSTM( + model.bilstm = BidirectionalLSTM( input_size, hidden_size, num_layers=num_layers, @@ -612,8 +612,8 @@ def model_init(self, merge_each_layer=merge_each_layer) @staticmethod - def model_forward(self, inputs): - return self.bilstm(inputs)[0] + def model_forward(model, inputs): + return 
model.bilstm(inputs)[0] def make_inputs(self): inputs = [ @@ -641,12 +641,12 @@ def setUp(self): self.param_states = {} @staticmethod - def model_init(self, input_size, hidden_size, num_layers): - self.gru = GRU(input_size, hidden_size, num_layers=num_layers) + def model_init(model, input_size, hidden_size, num_layers): + model.gru = GRU(input_size, hidden_size, num_layers=num_layers) @staticmethod - def model_forward(self, inputs): - return self.gru(inputs)[0] + def model_forward(model, inputs): + return model.gru(inputs)[0] def make_inputs(self): inputs = [ @@ -670,13 +670,13 @@ def setUp(self): self.param_states = {} @staticmethod - def model_init(self, + def model_init(model, input_size, hidden_size, num_layers, merge_mode="concat", merge_each_layer=False): - self.bigru = BidirectionalGRU( + model.bigru = BidirectionalGRU( input_size, hidden_size, num_layers=num_layers, @@ -684,8 +684,8 @@ def model_init(self, merge_each_layer=merge_each_layer) @staticmethod - def model_forward(self, inputs): - return self.bigru(inputs)[0] + def model_forward(model, inputs): + return model.bigru(inputs)[0] def make_inputs(self): inputs = [ @@ -713,8 +713,8 @@ def setUp(self): self.param_states = {} @staticmethod - def model_init(self, num_channels, num_filters, num_layers): - self.cnn_encoder = CNNEncoder( + def model_init(model, num_channels, num_filters, num_layers): + model.cnn_encoder = CNNEncoder( num_layers=2, num_channels=num_channels, num_filters=num_filters, @@ -722,8 +722,8 @@ def model_init(self, num_channels, num_filters, num_layers): pool_size=[7, 6]) @staticmethod - def model_forward(self, inputs): - return self.cnn_encoder(inputs) + def model_forward(model, inputs): + return model.cnn_encoder(inputs) def make_inputs(self): inputs = [ @@ -734,7 +734,7 @@ def make_inputs(self): ] return inputs - def test_check_output_merge0(self): + def test_check_output(self): self.check_output()
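The tests above drive BasicLSTMCell and BasicGRUCell through ModuleApiTest rather than against a hand-written reference. For readers following the simplification of BasicGRUCell earlier in this series, one step of the remaining forward pass can be mirrored in plain NumPy as a sanity check: the concatenated input/hidden is projected through a single [reset, update] gate weight, then a tanh candidate is gated by the update gate. This is only a sketch with illustrative shapes and names, assuming the default sigmoid gate and tanh candidate activations; the simplified LSTM cell follows the same concatenated-weight pattern, just with four gate slices instead of two.

    import numpy as np


    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))


    def basic_gru_step(x, pre_hidden, gate_w, gate_b, cand_w, cand_b):
        """One BasicGRUCell step with concatenated [reset, update] gates.

        x:          [batch, input_size]
        pre_hidden: [batch, hidden_size]
        gate_w:     [input_size + hidden_size, 2 * hidden_size]
        gate_b:     [2 * hidden_size]
        cand_w:     [input_size + hidden_size, hidden_size]
        cand_b:     [hidden_size]
        """
        hidden_size = pre_hidden.shape[1]

        # Joint projection of input and previous hidden state for both gates.
        gate_input = np.concatenate([x, pre_hidden], axis=1) @ gate_w + gate_b
        gate_input = sigmoid(gate_input)
        r, u = gate_input[:, :hidden_size], gate_input[:, hidden_size:]

        # Candidate state uses the reset-gated hidden state.
        r_hidden = r * pre_hidden
        candidate = np.concatenate([x, r_hidden], axis=1) @ cand_w + cand_b
        c = np.tanh(candidate)

        # Interpolate between the previous hidden state and the candidate.
        return u * pre_hidden + (1.0 - u) * c


    batch, input_size, hidden_size = 2, 32, 64
    x = np.random.rand(batch, input_size).astype("float32")
    h = np.random.rand(batch, hidden_size).astype("float32")
    gate_w = np.random.rand(input_size + hidden_size, 2 * hidden_size).astype("float32")
    gate_b = np.zeros(2 * hidden_size, dtype="float32")
    cand_w = np.random.rand(input_size + hidden_size, hidden_size).astype("float32")
    cand_b = np.zeros(hidden_size, dtype="float32")
    print(basic_gru_step(x, h, gate_w, gate_b, cand_w, cand_b).shape)  # (2, 64)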