diff --git a/examples/sentiment_classification/models.py b/examples/sentiment_classification/models.py
index 313b928..6332888 100644
--- a/examples/sentiment_classification/models.py
+++ b/examples/sentiment_classification/models.py
@@ -16,12 +16,13 @@
 from paddle.fluid.dygraph.base import to_variable
 import numpy as np
 from hapi.model import Model
-from hapi.text.text import GRUEncoderLayer as BiGRUEncoder
-from hapi.text.test import BOWEncoder, CNNEncoder, GRUEncoder
+from hapi.text.text import _GRUEncoder as GRUEncoder
+from hapi.text.text import _GRUEncoder as BiGRUEncoder
+from hapi.text.test import BOWEncoder, CNNEncoder
 
 
 class CNN(Model):
-    def __init__(self,  dict_dim, batch_size, seq_len):
+    def __init__(self, dict_dim, batch_size, seq_len):
         super(CNN, self).__init__()
         self.dict_dim = dict_dim
         self.emb_dim = 128
@@ -36,15 +37,19 @@ def __init__(self,  dict_dim, batch_size, seq_len):
             dict_size=self.dict_dim + 1,
             emb_dim=self.emb_dim,
             seq_len=self.seq_len,
-            filter_size= self.win_size,
-            num_filters= self.hid_dim,
-            hidden_dim= self.hid_dim,
+            filter_size=self.win_size,
+            num_filters=self.hid_dim,
+            hidden_dim=self.hid_dim,
             padding_idx=None,
             act='tanh')
-        self._fc1 = Linear(input_dim = self.hid_dim*self.seq_len, output_dim=self.fc_hid_dim, act="softmax")
-        self._fc_prediction = Linear(input_dim = self.fc_hid_dim,
-                                 output_dim = self.class_dim,
-                                 act="softmax")
+        self._fc1 = Linear(
+            input_dim=self.hid_dim * self.seq_len,
+            output_dim=self.fc_hid_dim,
+            act="softmax")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
 
     def forward(self, inputs):
         conv_3 = self._encoder(inputs)
@@ -69,11 +74,14 @@ def __init__(self, dict_dim, batch_size, seq_len):
             padding_idx=None,
             bow_dim=self.hid_dim,
             seq_len=self.seq_len)
-        self._fc1 = Linear(input_dim = self.hid_dim, output_dim=self.hid_dim, act="tanh")
-        self._fc2 = Linear(input_dim = self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
-        self._fc_prediction = Linear(input_dim = self.fc_hid_dim,
-                                 output_dim = self.class_dim,
-                                 act="softmax")
+        self._fc1 = Linear(
+            input_dim=self.hid_dim, output_dim=self.hid_dim, act="tanh")
+        self._fc2 = Linear(
+            input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
 
     def forward(self, inputs):
         bow_1 = self._encoder(inputs)
@@ -94,10 +102,12 @@ def __init__(self, dict_dim, batch_size, seq_len):
         self.class_dim = 2
         self.batch_size = batch_size
         self.seq_len = seq_len
-        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
-        self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
-                                 output_dim=self.class_dim,
-                                 act="softmax")
+        self._fc1 = Linear(
+            input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
         self._encoder = GRUEncoder(
             dict_size=self.dict_dim + 1,
             emb_dim=self.emb_dim,
@@ -112,7 +122,7 @@ def forward(self, inputs):
         prediction = self._fc_prediction(fc_1)
         return prediction
 
-        
+
 class BiGRU(Model):
     def __init__(self, dict_dim, batch_size, seq_len):
         super(BiGRU, self).__init__()
@@ -130,11 +140,13 @@ def __init__(self, dict_dim, batch_size, seq_len):
             is_sparse=False)
         h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
         h_0 = to_variable(h_0)
-        self._fc1 = Linear(input_dim = self.hid_dim, output_dim=self.hid_dim*3)
-        self._fc2 = Linear(input_dim = self.hid_dim*2, output_dim=self.fc_hid_dim, act="tanh")
-        self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
-                                 output_dim=self.class_dim,
-                                 act="softmax")
+        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3)
+        self._fc2 = Linear(
+            input_dim=self.hid_dim * 2, output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
         self._encoder = BiGRUEncoder(
             grnn_hidden_dim=self.hid_dim,
             input_dim=self.hid_dim * 3,
@@ -144,7 +156,8 @@ def __init__(self, dict_dim, batch_size, seq_len):
 
     def forward(self, inputs):
         emb = self.embedding(inputs)
-        emb = fluid.layers.reshape(emb, shape=[self.batch_size, -1, self.hid_dim])
+        emb = fluid.layers.reshape(
+            emb, shape=[self.batch_size, -1, self.hid_dim])
         fc_1 = self._fc1(emb)
         encoded_vector = self._encoder(fc_1)
         encoded_vector = fluid.layers.tanh(encoded_vector)
diff --git a/examples/transformer/transformer.py b/examples/transformer/transformer.py
index 30bb931..179dc17 100644
--- a/examples/transformer/transformer.py
+++ b/examples/transformer/transformer.py
@@ -18,7 +18,7 @@
 
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
+from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer
 from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
 from hapi.model import Model, CrossEntropy, Loss
 from hapi.text import TransformerBeamSearchDecoder, DynamicDecode
@@ -606,17 +606,18 @@ def forward(self, src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
         return predict
 
 
-class TransfomerCell(object):
+class TransformerCell(Layer):
     """
     Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be
     used as RNNCell
     """
 
     def __init__(self, decoder):
+        super(TransformerCell, self).__init__()
         self.decoder = decoder
 
-    def __call__(self, inputs, states, trg_src_attn_bias, enc_output,
-                 static_caches):
+    def forward(self, inputs, states, trg_src_attn_bias, enc_output,
+                static_caches):
         trg_word, trg_pos = inputs
         for cache, static_cache in zip(states, static_caches):
             cache.update(static_cache)
@@ -657,7 +658,7 @@ def __init__(self,
         self.beam_size = args.pop("beam_size")
         self.max_out_len = args.pop("max_out_len")
         super(InferTransformer, self).__init__(**args)
-        cell = TransfomerCell(self.decoder)
+        cell = TransformerCell(self.decoder)
         self.beam_search_decoder = DynamicDecode(
             TransformerBeamSearchDecoder(
                 cell, bos_id, eos_id, beam_size, var_dim_in_state=2),
diff --git a/hapi/model.py b/hapi/model.py
index 8c1c521..d825d5c 100644
--- a/hapi/model.py
+++ b/hapi/model.py
@@ -38,7 +38,7 @@
 from hapi.distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized
 from hapi.metrics import Metric
 from hapi.callbacks import config_callbacks
-from hapi.utils import to_list, to_numpy, flatten_list, restore_flatten_list
+from hapi.utils import to_list, to_numpy, flatten_list, restore_flatten_list, extract_args
 
 __all__ = [
     'Model',
@@ -495,14 +495,15 @@ def train_batch(self, inputs, labels=None):
         if labels is not None:
             labels = [to_variable(l) for l in to_list(labels)]
         if self._nranks > 1:
-            outputs = self.ddp_model.forward(*[to_variable(x) for x in inputs])
+            outputs = self.ddp_model.forward(
+                * [to_variable(x) for x in inputs])
             losses = self.model._loss_function(outputs, labels)
             final_loss = fluid.layers.sum(losses)
             final_loss = self.ddp_model.scale_loss(final_loss)
             final_loss.backward()
             self.ddp_model.apply_collective_grads()
         else:
-            outputs = self.model.forward(*[to_variable(x) for x in inputs])
+            outputs = self.model.forward(* [to_variable(x) for x in inputs])
             losses = self.model._loss_function(outputs, labels)
             final_loss = fluid.layers.sum(losses)
             final_loss.backward()
@@ -511,9 +512,9 @@ def train_batch(self, inputs, labels=None):
         self.model.clear_gradients()
         metrics = []
         for metric in self.model._metrics:
-            metric_outs = metric.add_metric_op(*(
-                to_list(outputs) + to_list(labels)))
-            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
+            metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list(
+                labels)))
+            m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)
 
         return ([to_numpy(l) for l in losses], metrics) \
@@ -525,7 +526,7 @@ def eval_batch(self, inputs, labels=None):
         inputs = to_list(inputs)
         if labels is not None:
             labels = [to_variable(l) for l in to_list(labels)]
-        outputs = self.model.forward(*[to_variable(x) for x in inputs])
+        outputs = self.model.forward(* [to_variable(x) for x in inputs])
         if self.model._loss_function:
             losses = self.model._loss_function(outputs, labels)
         else:
@@ -551,9 +552,9 @@ def eval_batch(self, inputs, labels=None):
                     self._merge_count[self.mode + '_total'] += samples
                     self._merge_count[self.mode + '_batch'] = samples
 
-            metric_outs = metric.add_metric_op(*(
-                to_list(outputs) + to_list(labels)))
-            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
+            metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list(
+                labels)))
+            m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)
 
         # To be consistent with static graph
diff --git a/hapi/tests/test_text.py b/hapi/tests/test_text.py
new file mode 100644
index 0000000..f74d631
--- /dev/null
+++ b/hapi/tests/test_text.py
@@ -0,0 +1,742 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# When running these tests, add the hapi root path to PYTHONPATH first:
+#   export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
+import unittest
+import time
+import random
+
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Embedding, Linear, Layer
+from paddle.fluid.layers import BeamSearchDecoder
+import hapi.text as text
+from hapi.model import Model, Input, set_device
+from hapi.text.text import *
+
+
+class ModuleApiTest(unittest.TestCase):  # shared harness for module API tests
+    @classmethod
+    def setUpClass(cls):  # save and seed the RNGs, then build the wrapper class
+        cls._np_rand_state = np.random.get_state()
+        cls._py_rand_state = random.getstate()
+        cls._random_seed = 123
+        np.random.seed(cls._random_seed)
+        random.seed(cls._random_seed)
+        # Model subclass whose __init__/forward come from this test class.
+        cls.model_cls = type(cls.__name__ + "Model", (Model, ), {
+            "__init__": cls.model_init_wrapper(cls.model_init),
+            "forward": cls.model_forward
+        })
+
+    @classmethod
+    def tearDownClass(cls):  # restore the RNG states saved in setUpClass
+        np.random.set_state(cls._np_rand_state)
+        random.setstate(cls._py_rand_state)
+
+    @staticmethod
+    def model_init_wrapper(func):  # make `func` run after `Model.__init__`
+        def __impl__(self, *args, **kwargs):
+            Model.__init__(self)
+            func(self, *args, **kwargs)
+
+        return __impl__
+
+    @staticmethod
+    def model_init(model, *args, **kwargs):  # hook: subclasses must override
+        raise NotImplementedError(
+            "model_init acts as `Model.__init__`, thus must implement it")
+
+    @staticmethod
+    def model_forward(model, *args, **kwargs):  # default: call `model.module`
+        return model.module(*args, **kwargs)
+
+    def make_inputs(self):  # hook: returns the `inputs` given to `model.prepare`
+        # TODO(guosheng): add default from `self.inputs`
+        raise NotImplementedError(
+            "make_inputs makes inputs for model, thus must implement it")
+
+    def setUp(self):
+        """
+        Declare the fixtures for the model wrapping the module under test:
+            `self.inputs`: list of input data passed to `test_batch`
+            `self.attrs`: list/dict of init arguments for the model
+            `self.param_states`: dict of parameter values to load, if any
+            `self.outputs`: list of expected outputs, if any
+        Subclasses override these; the model is built and run once with them.
+        """
+        self.inputs = []
+        self.attrs = {}
+        self.param_states = {}
+        self.outputs = []
+
+    def _calc_output(self, place, mode="test", dygraph=True):  # `mode` unused
+        if dygraph:
+            fluid.enable_dygraph(place)
+        else:
+            fluid.disable_dygraph()
+        fluid.default_main_program().random_seed = self._random_seed
+        fluid.default_startup_program().random_seed = self._random_seed
+        model = self.model_cls(**self.attrs) if isinstance(
+            self.attrs, dict) else self.model_cls(*self.attrs)
+        model.prepare(inputs=self.make_inputs(), device=place)
+        if self.param_states:  # only load when a subclass provides states
+            model.load(self.param_states, optim_state=None)
+        return model.test_batch(self.inputs)
+
+    def check_output_with_place(self, place, mode="test"):
+        dygraph_output = self._calc_output(place, mode, dygraph=True)
+        stgraph_output = self._calc_output(place, mode, dygraph=False)
+        expect_output = getattr(self, "outputs", None)
+        for actual_t, expect_t in zip(dygraph_output, stgraph_output):
+            self.assertTrue(np.allclose(actual_t, expect_t, rtol=1e-5, atol=0))
+        if expect_output:  # golden outputs are optional (None/empty skips this)
+            for actual_t, expect_t in zip(dygraph_output, expect_output):
+                self.assertTrue(
+                    np.allclose(
+                        actual_t, expect_t, rtol=1e-5, atol=0))
+
+    def check_output(self):  # run on CPU, plus GPU when compiled with CUDA
+        devices = ["CPU", "GPU"] if fluid.is_compiled_with_cuda() else ["CPU"]
+        for device in devices:
+            place = set_device(device)
+            self.check_output_with_place(place)
+
+class TestBasicLSTM(ModuleApiTest):  # RNN wrapping a single BasicLSTMCell
+    def setUp(self):
+        # TODO(guosheng): Use a bigger size. A bigger LSTM hidden size
+        # currently fails: the second static graph run may produce output
+        # that differs from the others.
+        shape = (2, 4, 16)  # last dim matches `input_size` below
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {"input_size": 16, "hidden_size": 16}
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model, input_size, hidden_size):  # attaches `model.lstm`
+        model.lstm = RNN(
+            BasicLSTMCell(
+                input_size,
+                hidden_size,
+                param_attr=fluid.ParamAttr(name="lstm_weight"),
+                bias_attr=fluid.ParamAttr(name="lstm_bias")))
+
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.lstm(inputs)[0]  # keep only the first element of the result
+
+    def make_inputs(self):  # one float32 input; only the last dim is fixed
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBasicGRU(ModuleApiTest):  # RNN wrapping a single BasicGRUCell
+    def setUp(self):
+        shape = (2, 4, 128)  # last dim matches `input_size` below
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {"input_size": 128, "hidden_size": 128}
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model, input_size, hidden_size):  # attaches `model.gru`
+        model.gru = RNN(BasicGRUCell(input_size, hidden_size))
+
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.gru(inputs)[0]  # keep only the first element of the result
+
+    def make_inputs(self):  # one float32 input; only the last dim is fixed
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBeamSearch(ModuleApiTest):  # DynamicDecode over a BeamSearchDecoder
+    def setUp(self):
+        shape = (8, 32)  # last dim matches `hidden_size` below
+        self.inputs = [  # fed in order as (init_hidden, init_cell)
+            np.random.random(shape).astype("float32"),
+            np.random.random(shape).astype("float32")
+        ]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {
+            "vocab_size": 100,
+            "embed_dim": 32,
+            "hidden_size": 32,
+        }
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model,
+                   vocab_size,
+                   embed_dim,
+                   hidden_size,
+                   bos_id=0,
+                   eos_id=1,
+                   beam_size=4,
+                   max_step_num=20):
+        embedder = Embedding(size=[vocab_size, embed_dim])
+        output_layer = Linear(hidden_size, vocab_size)
+        cell = BasicLSTMCell(embed_dim, hidden_size)
+        decoder = BeamSearchDecoder(  # NOTE(review): may come from the later wildcard import — confirm
+            cell,
+            start_token=bos_id,
+            end_token=eos_id,
+            beam_size=beam_size,
+            embedding_fn=embedder,
+            output_fn=output_layer)
+        model.beam_search_decoder = DynamicDecode(
+            decoder, max_step_num=max_step_num, is_test=True)
+
+    @staticmethod
+    def model_forward(model, init_hidden, init_cell):  # returns the first decode result
+        return model.beam_search_decoder([init_hidden, init_cell])[0]
+
+    def make_inputs(self):  # two float32 inputs; only the last dim is fixed
+        inputs = [
+            Input(
+                [None, self.inputs[0].shape[-1]],
+                "float32",
+                name="init_hidden"),
+            Input(
+                [None, self.inputs[1].shape[-1]], "float32", name="init_cell"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestTransformerEncoder(ModuleApiTest):  # parity test for TransformerEncoder
+    def setUp(self):
+        self.inputs = [
+            # encoder input: [batch_size, seq_len, hidden_size]
+            np.random.random([2, 4, 512]).astype("float32"),
+            # self attention bias: [batch_size, n_head, seq_len, seq_len]; randint(0, 1) only yields 0, so it is all zeros
+            np.random.randint(0, 1, [2, 8, 4, 4]).astype("float32") * -1e9
+        ]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {
+            "n_layer": 2,
+            "n_head": 8,
+            "d_key": 64,
+            "d_value": 64,
+            "d_model": 512,
+            "d_inner_hid": 1024
+        }
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model,  # attaches `model.encoder`; args are forwarded positionally
+                   n_layer,
+                   n_head,
+                   d_key,
+                   d_value,
+                   d_model,
+                   d_inner_hid,
+                   prepostprocess_dropout=0.1,
+                   attention_dropout=0.1,
+                   relu_dropout=0.1,
+                   preprocess_cmd="n",
+                   postprocess_cmd="da",
+                   ffn_fc1_act="relu"):
+        model.encoder = TransformerEncoder(
+            n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
+            prepostprocess_dropout, attention_dropout, relu_dropout,
+            preprocess_cmd, postprocess_cmd, ffn_fc1_act)
+
+    @staticmethod
+    def model_forward(model, enc_input, attn_bias):
+        return model.encoder(enc_input, attn_bias)
+
+    def make_inputs(self):  # fixes hidden size and head count; other dims are None
+        inputs = [
+            Input(
+                [None, None, self.inputs[0].shape[-1]],
+                "float32",
+                name="enc_input"),
+            Input(
+                [None, self.inputs[1].shape[1], None, None],
+                "float32",
+                name="attn_bias"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestTransformerDecoder(TestTransformerEncoder):  # parity test for TransformerDecoder
+    def setUp(self):
+        self.inputs = [
+            # decoder input: [batch_size, trg_seq_len, hidden_size]
+            np.random.random([2, 4, 512]).astype("float32"),
+            # encoder output: [batch_size, src_seq_len, hidden_size]
+            np.random.random([2, 5, 512]).astype("float32"),
+            # self attention bias: [batch_size, n_head, trg_seq_len, trg_seq_len]; randint(0, 1) only yields 0
+            np.random.randint(0, 1, [2, 8, 4, 4]).astype("float32") * -1e9,
+            # cross attention bias: [batch_size, n_head, trg_seq_len, src_seq_len]; all zeros as well
+            np.random.randint(0, 1, [2, 8, 4, 5]).astype("float32") * -1e9
+        ]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {
+            "n_layer": 2,
+            "n_head": 8,
+            "d_key": 64,
+            "d_value": 64,
+            "d_model": 512,
+            "d_inner_hid": 1024
+        }
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model,  # attaches `model.decoder`; args are forwarded positionally
+                   n_layer,
+                   n_head,
+                   d_key,
+                   d_value,
+                   d_model,
+                   d_inner_hid,
+                   prepostprocess_dropout=0.1,
+                   attention_dropout=0.1,
+                   relu_dropout=0.1,
+                   preprocess_cmd="n",
+                   postprocess_cmd="da"):
+        model.decoder = TransformerDecoder(
+            n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
+            prepostprocess_dropout, attention_dropout, relu_dropout,
+            preprocess_cmd, postprocess_cmd)
+
+    @staticmethod
+    def model_forward(model,
+                      dec_input,
+                      enc_output,
+                      self_attn_bias,
+                      cross_attn_bias,
+                      caches=None):  # the harness never supplies caches
+        return model.decoder(dec_input, enc_output, self_attn_bias,
+                             cross_attn_bias, caches)
+
+    def make_inputs(self):  # each spec is derived from the matching `self.inputs` entry
+        inputs = [
+            Input(
+                [None, None, self.inputs[0].shape[-1]],
+                "float32",
+                name="dec_input"),
+            Input(
+                [None, None, self.inputs[1].shape[-1]],
+                "float32",
+                name="enc_output"),
+            Input(
+                [None, self.inputs[2].shape[1], None, None],
+                "float32",
+                name="self_attn_bias"),
+            Input(
+                [None, self.inputs[-1].shape[1], None, None],
+                "float32",
+                name="cross_attn_bias"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestTransformerBeamSearchDecoder(ModuleApiTest):  # beam search over a TransformerCell
+    def setUp(self):
+        self.inputs = [
+            # encoder output: [batch_size, seq_len, hidden_size]
+            np.random.random([2, 5, 128]).astype("float32"),
+            # cross attention bias: [batch_size, n_head, 1, seq_len]; randint(0, 1) only yields 0, so it is all zeros
+            np.random.randint(0, 1, [2, 2, 1, 5]).astype("float32") * -1e9
+        ]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {
+            "vocab_size": 100,
+            "n_layer": 2,
+            "n_head": 2,
+            "d_key": 64,
+            "d_value": 64,
+            "d_model": 128,
+            "d_inner_hid": 128
+        }
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model,  # attaches beam_size, decoder and beam_search_decoder
+                   vocab_size,
+                   n_layer,
+                   n_head,
+                   d_key,
+                   d_value,
+                   d_model,
+                   d_inner_hid,
+                   prepostprocess_dropout=0.1,
+                   attention_dropout=0.1,
+                   relu_dropout=0.1,
+                   preprocess_cmd="n",
+                   postprocess_cmd="da",
+                   bos_id=0,
+                   eos_id=1,
+                   beam_size=4,
+                   max_step_num=20):
+        model.beam_size = beam_size  # read again by model_forward for tiling
+
+        def embedder_init(self, size):
+            Layer.__init__(self)
+            self.embedder = Embedding(size)
+        # Ad-hoc Layer whose forward takes (word, pos) but embeds only `word`.
+        Embedder = type("Embedder", (Layer, ), {
+            "__init__": embedder_init,
+            "forward": lambda self, word, pos: self.embedder(word)
+        })
+        embedder = Embedder(size=[vocab_size, d_model])
+        output_layer = Linear(d_model, vocab_size)
+        model.decoder = TransformerDecoder(
+            n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
+            prepostprocess_dropout, attention_dropout, relu_dropout,
+            preprocess_cmd, postprocess_cmd)
+        transformer_cell = TransformerCell(model.decoder, embedder,
+                                           output_layer)
+        model.beam_search_decoder = DynamicDecode(
+            TransformerBeamSearchDecoder(
+                transformer_cell,
+                bos_id,
+                eos_id,
+                beam_size,
+                var_dim_in_state=2),
+            max_step_num,
+            is_test=True)
+
+    @staticmethod
+    def model_forward(model, enc_output, trg_src_attn_bias):
+        caches = model.decoder.prepare_incremental_cache(enc_output)  # built before tiling
+        enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
+            enc_output, model.beam_size)
+        trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
+            trg_src_attn_bias, model.beam_size)
+        static_caches = model.decoder.prepare_static_cache(enc_output)  # built after tiling
+        rs, _ = model.beam_search_decoder(
+            inits=caches,
+            enc_output=enc_output,
+            trg_src_attn_bias=trg_src_attn_bias,
+            static_caches=static_caches)
+        return rs  # the second returned value is discarded
+
+    def make_inputs(self):  # fixes hidden size and head count; other dims are None
+        inputs = [
+            Input(
+                [None, None, self.inputs[0].shape[-1]],
+                "float32",
+                name="enc_output"),
+            Input(
+                [None, self.inputs[1].shape[1], None, None],
+                "float32",
+                name="trg_src_attn_bias"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSequenceTagging(ModuleApiTest):  # parity test for SequenceTagging with a target
+    def setUp(self):
+        self.inputs = [  # fed in order as (word, lengths, target)
+            np.random.randint(0, 100, (2, 8)).astype("int64"),
+            np.random.randint(1, 8, (2)).astype("int64"),
+            np.random.randint(0, 5, (2, 8)).astype("int64")
+        ]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {"vocab_size": 100, "num_labels": 5}
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model,  # attaches `model.tagger`; args are forwarded positionally
+                   vocab_size,
+                   num_labels,
+                   word_emb_dim=128,
+                   grnn_hidden_dim=128,
+                   emb_learning_rate=0.1,
+                   crf_learning_rate=0.1,
+                   bigru_num=2,
+                   init_bound=0.1):
+        model.tagger = SequenceTagging(
+            vocab_size, num_labels, word_emb_dim, grnn_hidden_dim,
+            emb_learning_rate, crf_learning_rate, bigru_num, init_bound)
+
+    @staticmethod
+    def model_forward(model, word, lengths, target=None):  # target is optional
+        return model.tagger(word, lengths, target)
+
+    def make_inputs(self):  # three int64 inputs with no fixed dimension
+        inputs = [
+            Input(
+                [None, None], "int64", name="word"),
+            Input(
+                [None], "int64", name="lengths"),
+            Input(
+                [None, None], "int64", name="target"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSequenceTaggingInfer(TestSequenceTagging):  # same test without the target input
+    def setUp(self):
+        super(TestSequenceTaggingInfer, self).setUp()
+        self.inputs = self.inputs[:2]  # remove target
+
+    def make_inputs(self):  # keep the first two parent input specs
+        inputs = super(TestSequenceTaggingInfer,
+                       self).make_inputs()[:2]  # remove target
+        return inputs
+
+
+class TestStackedRNN(ModuleApiTest):  # RNN over a StackedRNNCell of LSTM cells
+    def setUp(self):
+        shape = (2, 4, 16)  # last dim matches `input_size` below
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2}
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model, input_size, hidden_size, num_layers):  # honors num_layers
+        cells = [
+            BasicLSTMCell(input_size if i == 0 else hidden_size, hidden_size)
+            for i in range(num_layers)
+        ]
+        stacked_cell = StackedRNNCell(cells)
+        model.lstm = RNN(stacked_cell)
+
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.lstm(inputs)[0]  # keep only the first element of the result
+
+    def make_inputs(self):  # one float32 input; only the last dim is fixed
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestLSTM(ModuleApiTest):  # parity test for the multi-layer LSTM
+    def setUp(self):
+        shape = (2, 4, 16)  # last dim matches `input_size` below
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2}
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model, input_size, hidden_size, num_layers):  # attaches `model.lstm`
+        model.lstm = LSTM(input_size, hidden_size, num_layers=num_layers)
+
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.lstm(inputs)[0]  # keep only the first element of the result
+
+    def make_inputs(self):  # one float32 input; only the last dim is fixed
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBiLSTM(ModuleApiTest):  # parity test for BidirectionalLSTM
+    def setUp(self):
+        shape = (2, 4, 16)  # last dim matches `input_size` below
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2}
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model,  # attaches `model.bilstm`
+                   input_size,
+                   hidden_size,
+                   num_layers,
+                   merge_mode="concat",
+                   merge_each_layer=False):
+        model.bilstm = BidirectionalLSTM(
+            input_size,
+            hidden_size,
+            num_layers=num_layers,
+            merge_mode=merge_mode,
+            merge_each_layer=merge_each_layer)
+
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.bilstm(inputs)[0]  # keep only the first element of the result
+
+    def make_inputs(self):  # one float32 input; only the last dim is fixed
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+
+    def test_check_output_merge0(self):  # default merge_each_layer=False
+        self.check_output()
+
+    def test_check_output_merge1(self):  # attrs are rebuilt by setUp for each test
+        self.attrs["merge_each_layer"] = True
+        self.check_output()
+
+
+class TestGRU(ModuleApiTest):  # parity test for the multi-layer GRU
+    def setUp(self):
+        shape = (2, 4, 64)  # last dim matches `input_size` below
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {"input_size": 64, "hidden_size": 128, "num_layers": 2}
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model, input_size, hidden_size, num_layers):  # attaches `model.gru`
+        model.gru = GRU(input_size, hidden_size, num_layers=num_layers)
+
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.gru(inputs)[0]  # keep only the first element of the result
+
+    def make_inputs(self):  # one float32 input; only the last dim is fixed
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBiGRU(ModuleApiTest):  # parity test for BidirectionalGRU
+    def setUp(self):
+        shape = (2, 4, 64)  # last dim matches `input_size` below
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {"input_size": 64, "hidden_size": 128, "num_layers": 2}
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model,  # attaches `model.bigru`
+                   input_size,
+                   hidden_size,
+                   num_layers,
+                   merge_mode="concat",
+                   merge_each_layer=False):
+        model.bigru = BidirectionalGRU(
+            input_size,
+            hidden_size,
+            num_layers=num_layers,
+            merge_mode=merge_mode,
+            merge_each_layer=merge_each_layer)
+
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.bigru(inputs)[0]  # keep only the first element of the result
+
+    def make_inputs(self):  # one float32 input; only the last dim is fixed
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+
+    def test_check_output_merge0(self):  # default merge_each_layer=False
+        self.check_output()
+
+    def test_check_output_merge1(self):  # attrs are rebuilt by setUp for each test
+        self.attrs["merge_each_layer"] = True
+        self.check_output()
+
+
+class TestCNNEncoder(ModuleApiTest):  # parity test for CNNEncoder
+    def setUp(self):
+        shape = (2, 32, 8)  # [N, C, H]
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None  # no golden output; only dygraph vs static is compared
+        self.attrs = {"num_channels": 32, "num_filters": 64, "num_layers": 2}
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(model, num_channels, num_filters, num_layers):  # attaches `model.cnn_encoder`
+        model.cnn_encoder = CNNEncoder(
+            num_layers=num_layers,
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=[2, 3],  # presumably one entry per layer — TODO confirm
+            pool_size=[7, 6])  # presumably one entry per layer — TODO confirm
+
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.cnn_encoder(inputs)
+
+    def make_inputs(self):  # one float32 input; only the channel dim is fixed
+        inputs = [
+            Input(
+                [None, self.inputs[-1].shape[1], None],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':  # allow running this test file directly
+    unittest.main()
diff --git a/hapi/text/__init__.py b/hapi/text/__init__.py
index 2177ada..2cefb4f 100644
--- a/hapi/text/__init__.py
+++ b/hapi/text/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,18 +16,27 @@
 from hapi.text.text import BasicLSTMCell as BasicLSTMCell
 from hapi.text.text import BasicGRUCell as BasicGRUCell
 from hapi.text.text import RNN as RNN
+from hapi.text.text import StackedLSTMCell as StackedLSTMCell
+from hapi.text.text import LSTM as LSTM
+from hapi.text.text import BidirectionalLSTM as BidirectionalLSTM
+from hapi.text.text import StackedGRUCell as StackedGRUCell
+from hapi.text.text import GRU as GRU
+from hapi.text.text import BidirectionalGRU as BidirectionalGRU
 from hapi.text.text import DynamicDecode as DynamicDecode
 from hapi.text.text import BeamSearchDecoder as BeamSearchDecoder
+
+from hapi.text.text import Conv1dPoolLayer as Conv1dPoolLayer
+from hapi.text.text import CNNEncoder as CNNEncoder
+
 from hapi.text.text import MultiHeadAttention as MultiHeadAttention
 from hapi.text.text import FFN as FFN
 from hapi.text.text import TransformerEncoderLayer as TransformerEncoderLayer
 from hapi.text.text import TransformerDecoderLayer as TransformerDecoderLayer
 from hapi.text.text import TransformerEncoder as TransformerEncoder
 from hapi.text.text import TransformerDecoder as TransformerDecoder
+from hapi.text.text import TransformerCell as TransformerCell
 from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearchDecoder
-from hapi.text.text import GRUCell as GRUCell
-from hapi.text.text import GRUEncoderCell as GRUEncoderCell
-from hapi.text.text import BiGRU as BiGRU
-from hapi.text.text import Linear_chain_crf as Linear_chain_crf
-from hapi.text.text import Crf_decoding as Crf_decoding
+
+from hapi.text.text import LinearChainCRF as LinearChainCRF
+from hapi.text.text import CRFDecoding as CRFDecoding
 from hapi.text.text import SequenceTagging as SequenceTagging
diff --git a/hapi/text/text.py b/hapi/text/text.py
index ed803ae..de392f5 100644
--- a/hapi/text/text.py
+++ b/hapi/text/text.py
@@ -16,44 +16,61 @@
 from __future__ import division
 from __future__ import print_function
 
-import os
+import copy
+import collections
 import six
 import sys
-if six.PY2:
-    reload(sys)
-    sys.setdefaultencoding('utf8')
+from functools import partial, reduce
 
-import ast
-import time
-import argparse as argparse
 import numpy as np
-import multiprocessing
-
-import collections
-import copy
-from functools import partial, reduce
 
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers.utils as utils
-from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
-from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm, GRUUnit
-from paddle.fluid.data_feeder import convert_dtype
-
 from paddle.fluid import layers
-from paddle.fluid.dygraph import Layer
 from paddle.fluid.layers import BeamSearchDecoder
+from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
+from paddle.fluid.dygraph import Layer, Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D
+from paddle.fluid.data_feeder import convert_dtype
 
 __all__ = [
-    'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode',
-    'BeamSearchDecoder', 'MultiHeadAttention', 'FFN',
-    'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer',
-    'TransformerDecoder', 'TransformerBeamSearchDecoder', 'Linear_chain_crf',
-    'Crf_decoding', 'SequenceTagging', 'GRUEncoderLayer'
+    'RNNCell',
+    'BasicLSTMCell',
+    'BasicGRUCell',
+    'RNN',
+    'BidirectionalRNN',
+    'StackedRNNCell',
+    'StackedLSTMCell',
+    'LSTM',
+    'BidirectionalLSTM',
+    'StackedGRUCell',
+    'GRU',
+    'BidirectionalGRU',
+    'DynamicDecode',
+    'BeamSearchDecoder',
+    'Conv1dPoolLayer',
+    'CNNEncoder',
+    'MultiHeadAttention',
+    'FFN',
+    'TransformerEncoderLayer',
+    'TransformerEncoder',
+    'TransformerDecoderLayer',
+    'TransformerDecoder',
+    'TransformerCell',
+    'TransformerBeamSearchDecoder',
+    'LinearChainCRF',
+    'CRFDecoding',
+    'SequenceTagging',
 ]
 
 
 class RNNCell(Layer):
+    """
+    RNNCell is the base class of abstractions representing the calculations
+    mapping the input and state to the output and new state. It is suited to,
+    and mostly used in, RNN.
+    """
+
     def get_initial_states(self,
                            batch_ref,
                            shape=None,
@@ -68,16 +85,18 @@ def get_initial_states(self,
             batch_ref: A (possibly nested structure of) tensor variable[s].
                 The first dimension of the tensor will be used as batch size to
                 initialize states.
-            shape: A (possiblely nested structure of) shape[s], where a shape is
+            shape: A (possibly nested structure of) shape[s], where a shape is
                 represented as a list/tuple of integer). -1(for batch size) will
                 beautomatically inserted if shape is not started with it. If None,
                 property `state_shape` will be used. The default value is None.
-            dtype: A (possiblely nested structure of) data type[s]. The structure
+            dtype: A (possibly nested structure of) data type[s]. The structure
                 must be same as that of `shape`, except when all tensors' in states
                 has the same data type, a single data type can be used. If None and
                 property `cell.state_shape` is not available, float32 will be used
                 as the data type. The default value is None.
             init_value: A float value used to initialize states.
+            batch_dim_idx: An integer indicating which dimension of the tensor in
+                `batch_ref` represents batch size. The default value is 0.
 
         Returns:
             Variable: tensor variable[s] packed in the same structure provided \
@@ -168,46 +187,54 @@ def state_dtype(self):
 
 class BasicLSTMCell(RNNCell):
     """
-    ****
-    BasicLSTMUnit class, Using basic operator to build LSTM
-    The algorithm can be described as the code below.
-        .. math::
-           i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)
-           f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )
-           o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)
-           \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
-           c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-           h_t &= o_t \odot tanh(c_t)
-        - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
-          of weights from the input gate to the input)
-        - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
-        - sigmoid is the logistic sigmoid function.
-        - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
-          and cell activation vectors, respectively, all of which have the same size as
-          the cell output activation vector $h$.
-        - The :math:`\odot` is the element-wise product of the vectors.
-        - :math:`tanh` is the activation functions.
-        - :math:`\\tilde{c_t}` is also called candidate hidden state,
-          which is computed based on the current input and the previous hidden state.
-    Args:
-        name_scope(string) : The name scope used to identify parameter and bias name
-        hidden_size (integer): The hidden size used in the Unit.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, lstm_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of LSTM unit.
-            If it is set to None or one attribute of ParamAttr, lstm_unit will
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized as zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-                                  Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cells (actNode).
-                             Default: 'fluid.layers.tanh'
-        forget_bias(float|1.0): forget bias used when computing forget gate
-        dtype(string): data type used in this unit
+    Long Short-Term Memory (LSTM) RNN cell.
+
+    The formula used is as follows:
+
+    .. math::
+
+        i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
+
+        f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
+
+        c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
+
+        o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
+
+        h_{t} & = o_{t} act_c (c_{t})
+
+    Please refer to `An Empirical Exploration of Recurrent Network Architectures
+    <http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.
+
+    Parameters:
+        input_size (int): The input size in the LSTM cell.
+        hidden_size (int): The hidden size in the LSTM cell.
+        param_attr(ParamAttr, optional): The parameter attribute for the learnable
+            weight matrix. Default: None.
+        bias_attr (ParamAttr, optional): The parameter attribute for the bias
+            of LSTM. Default: None.
+        gate_activation (function, optional): The activation function for gates
+            of LSTM, that is :math:`act_g` in the formula. Default: None,
+            which means `fluid.layers.sigmoid` is used.
+        activation (function, optional): The non-gate activation function of
+            LSTM, that is :math:`act_c` in the formula. Default: None,
+            which means `fluid.layers.tanh` is used.
+        forget_bias(float, optional): The forget bias added when computing the
+            forget gate. Default: 1.0.
+        dtype(string, optional): The data type used in this cell. Default: float32.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import BasicLSTMCell, RNN
+
+            inputs = paddle.rand((2, 4, 32))
+            cell = BasicLSTMCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
     """
 
     def __init__(self,
@@ -218,19 +245,7 @@ def __init__(self,
                  gate_activation=None,
                  activation=None,
                  forget_bias=1.0,
-                 dtype='float32',
-                 forget_gate_weights={"w": None,
-                                      "h": None,
-                                      "b": None},
-                 input_gate_weights={"w": None,
-                                     "h": None,
-                                     "b": None},
-                 output_gate_weights={"w": None,
-                                      "h": None,
-                                      "b": None},
-                 cell_weights={"w": None,
-                               "h": None,
-                               "b": None}):
+                 dtype='float32'):
         super(BasicLSTMCell, self).__init__()
 
         self._hidden_size = hidden_size
@@ -241,276 +256,115 @@ def __init__(self,
         # TODO(guosheng): find better way to resolve constants in __init__
         self._forget_bias = layers.create_global_var(
             shape=[1], dtype=dtype, value=forget_bias, persistable=True)
-        self._forget_bias.stop_gradient = False
+        self._forget_bias.stop_gradient = True
         self._dtype = dtype
         self._input_size = input_size
 
-        self.use_customized_weight = False
-        for _weights in [
-                forget_gate_weights, input_gate_weights, output_gate_weights,
-                cell_weights
-        ]:
-            for _key in _weights:
-                if _weights[_key] is not None:
-                    self.use_customized_weight = True
-                    break
-            if self.use_customized_weight:
-                break
-
-        if not self.use_customized_weight:
-
-            self._weight = self.create_parameter(
-                attr=self._param_attr,
-                shape=[
-                    self._input_size + self._hidden_size, 4 * self._hidden_size
-                ],
-                dtype=self._dtype)
-
-            self._bias = self.create_parameter(
-                attr=self._bias_attr,
-                shape=[4 * self._hidden_size],
-                dtype=self._dtype,
-                is_bias=True)
-        else:
-            if "w" in forget_gate_weights and forget_gate_weights[
-                    "w"] is not None:
-                self.fg_w = forget_gate_weights["w"]
-            else:
-                if self._param_attr is not None and self._param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(self._param_attr)
-                    tmp_param_attr.name += "_forget_gate_w"
-                else:
-                    tmp_param_attr = self._param_attr
-                self.fg_w = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._input_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "h" in forget_gate_weights and forget_gate_weights[
-                    "h"] is not None:
-                self.fg_h = forget_gate_weights["h"]
-            else:
-                if self._param_attr is not None and self._param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(self._param_attr)
-                    tmp_param_attr.name += "_forget_gate_h"
-                else:
-                    tmp_param_attr = self._param_attr
-                self.fg_h = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "b" in forget_gate_weights and forget_gate_weights[
-                    "b"] is not None:
-                self.fg_b = forget_gate_weights["b"]
-            else:
-                if self._bias_attr is not None and self._bias_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(self._bias_attr)
-                    tmp_param_attr.name += "_forget_gate_b"
-                else:
-                    tmp_param_attr = self._bias_attr
-                self.fg_b = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size],
-                    dtype=self._dtype,
-                    is_bias=True)
-
-            if "w" in input_gate_weights and input_gate_weights[
-                    "w"] is not None:
-                self.ig_w = input_gate_weights["w"]
-            else:
-                if self._param_attr is not None and self._param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(self._param_attr)
-                    tmp_param_attr.name += "_input_gate_w"
-                else:
-                    tmp_param_attr = self._param_attr
-
-                self.ig_w = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._input_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "h" in input_gate_weights and input_gate_weights[
-                    "h"] is not None:
-                self.ig_h = input_gate_weights["h"]
-            else:
-                if self._param_attr is not None and self._param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(self._param_attr)
-                    tmp_param_attr.name += "_input_gate_h"
-                else:
-                    tmp_param_attr = self._param_attr
-
-                self.ig_h = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "b" in input_gate_weights and input_gate_weights[
-                    "b"] is not None:
-                self.ig_b = input_gate_weights["b"]
-            else:
-                if self._bias_attr is not None and self._bias_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(self._bias_attr)
-                    tmp_param_attr.name += "_input_gate_b"
-                else:
-                    tmp_param_attr = self._bias_attr
-                self.ig_b = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size],
-                    dtype=self._dtype,
-                    is_bias=True)
-
-            if "w" in output_gate_weights and output_gate_weights[
-                    "w"] is not None:
-                self.og_w = output_gate_weights["w"]
-            else:
-                if self._param_attr is not None and self._param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(self._param_attr)
-                    tmp_param_attr.name += "_output_gate_w"
-                else:
-                    tmp_param_attr = self._param_attr
-                self.og_w = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._input_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "h" in output_gate_weights and output_gate_weights[
-                    "h"] is not None:
-                self.og_h = output_gate_weights["h"]
-            else:
-                if self._param_attr is not None and self._param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(self._param_attr)
-                    tmp_param_attr.name += "_output_gate_h"
-                else:
-                    tmp_param_attr = self._param_attr
-
-                self.og_h = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "b" in output_gate_weights and output_gate_weights[
-                    "b"] is not None:
-                self.og_b = output_gate_weights["b"]
-            else:
-                if self._bias_attr is not None and self._bias_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(self._bias_attr)
-                    tmp_param_attr.name += "_output_gate_b"
-                else:
-                    tmp_param_attr = self._bias_attr
-                self.og_b = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size],
-                    dtype=self._dtype,
-                    is_bias=True)
-
-            if "w" in cell_weights and cell_weights["w"] is not None:
-                self.c_w = cell_weights["w"]
-            else:
-                if self._param_attr is not None and self._param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(self._param_attr)
-                    tmp_param_attr.name += "_cell_w"
-                else:
-                    tmp_param_attr = self._param_attr
-
-                self.c_w = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._input_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "h" in cell_weights and cell_weights["h"] is not None:
-                self.c_h = cell_weights["h"]
-            else:
-                if self._param_attr is not None and self._param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(self._param_attr)
-                    tmp_param_attr.name += "_cell_h"
-                else:
-                    tmp_param_attr = self._param_attr
-                self.c_h = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "b" in cell_weights and cell_weights["b"] is not None:
-                self.c_b = cell_weights["b"]
-            else:
-                if self._bias_attr is not None and self._bias_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(self._bias_attr)
-                    tmp_param_attr.name += "_cell_b"
-                else:
-                    tmp_param_attr = self._bias_attr
-                self.c_b = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size],
-                    dtype=self._dtype,
-                    is_bias=True)
-
-    def forward(self, input, state):
-
-        if self.use_customized_weight:
-            weight_w = fluid.layers.concat(
-                [self.ig_w, self.c_w, self.fg_w, self.og_w], axis=-1)
-            weight_h = fluid.layers.concat(
-                [self.ig_h, self.c_h, self.fg_h, self.og_h], axis=-1)
-            _weight = fluid.layers.concat([weight_w, weight_h], axis=0)
-            _bias = fluid.layers.concat(
-                [self.ig_b, self.c_b, self.fg_b, self.og_b])
-        else:
-            _weight = self._weight
-            _bias = self._bias
+        self._weight = self.create_parameter(
+            attr=self._param_attr,
+            shape=[
+                self._input_size + self._hidden_size, 4 * self._hidden_size
+            ],
+            dtype=self._dtype)
+
+        self._bias = self.create_parameter(
+            attr=self._bias_attr,
+            shape=[4 * self._hidden_size],
+            dtype=self._dtype,
+            is_bias=True)
 
-        pre_hidden, pre_cell = state
-        concat_input_hidden = layers.concat([input, pre_hidden], 1)
-        gate_input = layers.matmul(x=concat_input_hidden, y=_weight)
+    def forward(self, inputs, states):
+        """
+        Performs single step LSTM calculations.
+
+        Parameters:
+            inputs (Variable): A tensor with shape `[batch_size, input_size]`,
+                corresponding to :math:`x_t` in the formula. The data type
+                should be float32 or float64.
+            states (Variable): A list containing two tensors, each shaped
+                `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}`
+                in the formula. The data type should be float32 or float64.
 
-        gate_input = layers.elementwise_add(gate_input, _bias)
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
+                a tensor with shape `[batch_size, hidden_size]`, corresponding \
+                to :math:`h_{t}` in the formula; `new_states` is a list containing \
+                two tensor variables shaped `[batch_size, hidden_size]`, corresponding \
+                to :math:`h_{t}, c_{t}` in the formula. The data type of all these \
+                tensors is the same as that of `states`.
+        """
+        pre_hidden, pre_cell = states
+        concat_input_hidden = layers.concat([inputs, pre_hidden], 1)
+        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
+        gate_input = layers.elementwise_add(gate_input, self._bias)
         i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
         new_cell = layers.elementwise_add(
             layers.elementwise_mul(
                 pre_cell,
-                layers.sigmoid(layers.elementwise_add(f, self._forget_bias))),
-            layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)))
-        new_hidden = layers.tanh(new_cell) * layers.sigmoid(o)
+                self._gate_activation(
+                    layers.elementwise_add(f, self._forget_bias))),
+            layers.elementwise_mul(
+                self._gate_activation(i), self._activation(j)))
+        new_hidden = self._activation(new_cell) * self._gate_activation(o)
 
         return new_hidden, [new_hidden, new_cell]
 
     @property
     def state_shape(self):
+        """
+        The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]`
+        (-1 for batch size would be automatically inserted into shape). These two
+        shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` respectively.
+        """
         return [[self._hidden_size], [self._hidden_size]]
 
 
 class BasicGRUCell(RNNCell):
     """
-    ****
-    BasicGRUUnit class, using basic operators to build GRU
-    The algorithm can be described as the equations below.
-
-        .. math::
-            u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u)
-
-            r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r)
-
-            m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m)
-
-            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
-
-    Args:
-        hidden_size (integer): The hidden size used in the Unit.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, gru_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of GRU unit.
-            If it is set to None or one attribute of ParamAttr, gru_unit will 
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-                                  Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cell (actNode).
-                             Default: 'fluid.layers.tanh'
-        dtype(string): data type used in this unit
+    Gated Recurrent Unit (GRU) RNN cell.
+
+    The formula for GRU used is as follows:
+
+    .. math::
+
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+
+    Please refer to `An Empirical Exploration of Recurrent Network Architectures
+    <http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.
+
+    Parameters:
+        input_size (int): The input size in the GRU cell.
+        hidden_size (int): The hidden size in the GRU cell.
+        param_attr(ParamAttr, optional): The parameter attribute for the learnable
+            weight matrix. Default: None.
+        bias_attr (ParamAttr, optional): The parameter attribute for the bias
+            of GRU. Default: None.
+        gate_activation (function, optional): The activation function for gates
+            of GRU, that is :math:`act_g` in the formula. Default: None,
+            which means `fluid.layers.sigmoid` is used.
+        activation (function, optional): The non-gate activation function of
+            GRU, that is :math:`act_c` in the formula. Default: None,
+            which means `fluid.layers.tanh` is used.
+        dtype(string, optional): The data type used in this cell. Default: float32.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import BasicGRUCell, RNN
+
+            inputs = paddle.rand((2, 4, 32))
+            cell = BasicGRUCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
     """
 
     def __init__(self,
@@ -520,16 +374,7 @@ def __init__(self,
                  bias_attr=None,
                  gate_activation=None,
                  activation=None,
-                 dtype='float32',
-                 update_gate_weights={"w": None,
-                                      "h": None,
-                                      "b": None},
-                 reset_gate_weights={"w": None,
-                                     "h": None,
-                                     "b": None},
-                 cell_weights={"w": None,
-                               "h": None,
-                               "b": None}):
+                 dtype='float32'):
         super(BasicGRUCell, self).__init__()
         self._input_size = input_size
         self._hidden_size = hidden_size
@@ -539,20 +384,6 @@ def __init__(self,
         self._activation = activation or layers.tanh
         self._dtype = dtype
 
-        assert isinstance(update_gate_weights, dict)
-        assert isinstance(reset_gate_weights, dict)
-        assert isinstance(cell_weights, dict)
-
-        self.use_customized_weight = False
-        for _weights in [
-                update_gate_weights, reset_gate_weights, cell_weights
-        ]:
-            for _key in _weights:
-                if _weights[_key] is not None:
-                    self.use_customized_weight = True
-            if self.use_customized_weight:
-                break
-
         if self._param_attr is not None and self._param_attr.name is not None:
             gate_param_attr = copy.deepcopy(self._param_attr)
             candidate_param_attr = copy.deepcopy(self._param_attr)
@@ -562,194 +393,62 @@ def __init__(self,
             gate_param_attr = self._param_attr
             candidate_param_attr = self._param_attr
 
-        if not self.use_customized_weight:
-            self._gate_weight = self.create_parameter(
-                attr=gate_param_attr,
-                shape=[
-                    self._input_size + self._hidden_size, 2 * self._hidden_size
-                ],
-                dtype=self._dtype)
-
-            self._candidate_weight = self.create_parameter(
-                attr=candidate_param_attr,
-                shape=[
-                    self._input_size + self._hidden_size, self._hidden_size
-                ],
-                dtype=self._dtype)
-
-            if self._bias_attr is not None and self._bias_attr.name is not None:
-                gate_bias_attr = copy.deepcopy(self._bias_attr)
-                candidate_bias_attr = copy.deepcopy(self._bias_attr)
-                gate_bias_attr.name += "_gate"
-                candidate_bias_attr.name += "_candidate"
-            else:
-                gate_bias_attr = self._bias_attr
-                candidate_bias_attr = self._bias_attr
-
-            self._gate_bias = self.create_parameter(
-                attr=gate_bias_attr,
-                shape=[2 * self._hidden_size],
-                dtype=self._dtype,
-                is_bias=True)
-            self._candidate_bias = self.create_parameter(
-                attr=candidate_bias_attr,
-                shape=[self._hidden_size],
-                dtype=self._dtype,
-                is_bias=True)
+        self._gate_weight = self.create_parameter(
+            attr=gate_param_attr,
+            shape=[
+                self._input_size + self._hidden_size, 2 * self._hidden_size
+            ],
+            dtype=self._dtype)
 
-        else:
+        self._candidate_weight = self.create_parameter(
+            attr=candidate_param_attr,
+            shape=[self._input_size + self._hidden_size, self._hidden_size],
+            dtype=self._dtype)
 
-            # create the parameters of gates in gru
-            if "w" in update_gate_weights and update_gate_weights[
-                    "w"] is not None:
-                self.ug_w = update_gate_weights["w"]
-            else:
-                if gate_param_attr is not None and gate_param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(gate_param_attr)
-                    tmp_param_attr.name += "_update_gate_w"
-                else:
-                    tmp_param_attr = gate_param_attr
-                self.ug_w = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._input_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "h" in update_gate_weights and update_gate_weights[
-                    "h"] is not None:
-                self.ug_h = update_gate_weights["h"]
-            else:
-                if gate_param_attr is not None and gate_param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(gate_param_attr)
-                    tmp_param_attr.name += "_update_gate_h"
-                else:
-                    tmp_param_attr = gate_param_attr
-                self.ug_h = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "b" in update_gate_weights and update_gate_weights[
-                    "b"] is not None:
-                self.ug_b = update_gate_weights["b"]
-            else:
-                if gate_bias_attr is not None and gate_bias_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(gate_bias_attr)
-                    tmp_param_attr.name += "_update_gate_b"
-                else:
-                    tmp_param_attr = gate_bias_attr
-                self.ug_b = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size],
-                    dtype=self._dtype,
-                    is_bias=True)
-
-            # reset gate parameters
-            if "w" in reset_gate_weights and reset_gate_weights[
-                    "w"] is not None:
-                self.rg_w = reset_gate_weights["w"]
-            else:
-                if gate_param_attr is not None and gate_param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(gate_param_attr)
-                    tmp_param_attr.name += "_reset_gate_w"
-                else:
-                    tmp_param_attr = gate_param_attr
-                self.rg_w = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._input_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "h" in reset_gate_weights and reset_gate_weights[
-                    "h"] is not None:
-                self.rg_h = reset_gate_weights["h"]
-            else:
-                if gate_param_attr is not None and gate_param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(gate_param_attr)
-                    tmp_param_attr.name += "_reset_gate_h"
-                else:
-                    tmp_param_attr = gate_param_attr
-                self.rg_h = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "b" in reset_gate_weights and reset_gate_weights[
-                    "b"] is not None:
-                self.rg_b = reused_params["b"]
-            else:
-                if gate_bias_attr is not None and gate_bias_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(gate_bias_attr)
-                    tmp_param_attr.name += "_reset_gate_b"
-                else:
-                    tmp_param_attr = gate_bias_attr
-                self.rg_b = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size],
-                    dtype=self._dtype,
-                    is_bias=True)
-
-            # cell parameters
-            if "w" in cell_weights and cell_weights["w"] is not None:
-                self.c_w = cell_weights["w"]
-            else:
-                if candidate_param_attr is not None and candidate_param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(candidate_param_attr)
-                    tmp_param_attr.name += "_cell_w"
-                else:
-                    tmp_param_attr = gate_param_attr
-
-                self.c_w = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._input_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "h" in cell_weights and cell_weights["h"] is not None:
-                self.c_h = cell_weights["h"]
-            else:
-                if candidate_param_attr is not None and candidate_param_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(candidate_param_attr)
-                    tmp_param_attr.name += "_cell_h"
-                else:
-                    tmp_param_attr = gate_param_attr
-                self.c_h = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size, self._hidden_size],
-                    dtype=self._dtype)
-
-            if "b" in cell_weights and cell_weights["b"] is not None:
-                self.c_b = cell_weights["b"]
-            else:
-                if candidate_bias_attr is not None and candidate_bias_attr.name is not None:
-                    tmp_param_attr = copy.deepcopy(candidate_bias_attr)
-                    tmp_param_attr.name += "_cell_b"
-                else:
-                    tmp_param_attr = gate_bias_attr
-                self.c_b = self.create_parameter(
-                    attr=tmp_param_attr,
-                    shape=[self._hidden_size],
-                    dtype=self._dtype,
-                    is_bias=True)
-
-    def forward(self, input, state):
-
-        if self.use_customized_weight:
-            rg_weights = layers.concat([self.rg_w, self.rg_h], axis=0)
-            ug_weights = layers.concat([self.ug_w, self.ug_h], axis=0)
-            _gate_weight = layers.concat([rg_weights, ug_weights], axis=-1)
-            _candidate_weight = layers.concat([self.c_w, self.c_h], axis=0)
-            _gate_bias = layers.concat([self.rg_b, self.ug_b], axis=0)
-            _candidate_bias = self.c_b
+        if self._bias_attr is not None and self._bias_attr.name is not None:
+            gate_bias_attr = copy.deepcopy(self._bias_attr)
+            candidate_bias_attr = copy.deepcopy(self._bias_attr)
+            gate_bias_attr.name += "_gate"
+            candidate_bias_attr.name += "_candidate"
         else:
-            _gate_weight = self._gate_weight
-            _gate_bias = self._gate_bias
-            _candidate_weight = self._candidate_weight
-            _candidate_bias = self._candidate_bias
+            gate_bias_attr = self._bias_attr
+            candidate_bias_attr = self._bias_attr
+
+        self._gate_bias = self.create_parameter(
+            attr=gate_bias_attr,
+            shape=[2 * self._hidden_size],
+            dtype=self._dtype,
+            is_bias=True)
+        self._candidate_bias = self.create_parameter(
+            attr=candidate_bias_attr,
+            shape=[self._hidden_size],
+            dtype=self._dtype,
+            is_bias=True)
 
-        pre_hidden = state
-        concat_input_hidden = layers.concat([input, pre_hidden], axis=1)
+    def forward(self, inputs, states):
+        """
+        Performs single step GRU calculations.
 
-        gate_input = layers.matmul(x=concat_input_hidden, y=_gate_weight)
+        Parameters:
+            inputs (Variable): A tensor with shape `[batch_size, input_size]`,
+                corresponding to :math:`x_t` in the formula. The data type
+                should be float32 or float64.
+            states (Variable): A tensor with shape `[batch_size, hidden_size]`.
+                corresponding to :math:`h_{t-1}` in the formula. The data type
+                should be float32 or float64.
 
-        gate_input = layers.elementwise_add(gate_input, _gate_bias)
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \
+                `new_states` are the same tensor shaped `[batch_size, hidden_size]`, \
+                corresponding to :math:`h_t` in the formula. The data type of the \
+                tensor is the same as that of `states`.
+        """
+        pre_hidden = states
+        concat_input_hidden = layers.concat([inputs, pre_hidden], axis=1)
+
+        gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
+
+        gate_input = layers.elementwise_add(gate_input, self._gate_bias)
 
         gate_input = self._gate_activation(gate_input)
         r, u = layers.split(gate_input, num_or_sections=2, dim=1)
@@ -757,8 +456,8 @@ def forward(self, input, state):
         r_hidden = r * pre_hidden
 
         candidate = layers.matmul(
-            layers.concat([input, r_hidden], 1), _candidate_weight)
-        candidate = layers.elementwise_add(candidate, _candidate_bias)
+            layers.concat([inputs, r_hidden], 1), self._candidate_weight)
+        candidate = layers.elementwise_add(candidate, self._candidate_bias)
 
         c = self._activation(candidate)
         new_hidden = u * pre_hidden + (1 - u) * c
@@ -767,10 +466,44 @@ def forward(self, input, state):
 
     @property
     def state_shape(self):
+        """
+        The `state_shape` of BasicGRUCell is `[hidden_size]`, the shape of the
+        state :math:`h_{t-1}` without the batch dimension (-1 for batch size
+        would be automatically inserted into shape).
+        """
         return [self._hidden_size]
 
 
-class RNN(fluid.dygraph.Layer):
+class RNN(Layer):
+    """
+    RNN creates a recurrent neural network specified by RNNCell `cell`, which
+    performs :code:`cell.forward()` repeatedly until reaching the maximum
+    length of `inputs`.
+
+    Parameters:
+        cell(RNNCell): An instance of `RNNCell`.
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`.  If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import StackedLSTMCell, RNN
+
+            inputs = paddle.rand((2, 4, 32))
+            cell = StackedLSTMCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
+    """
+
     def __init__(self, cell, is_reverse=False, time_major=False):
         super(RNN, self).__init__()
         self.cell = cell
@@ -786,6 +519,38 @@ def forward(self,
                 initial_states=None,
                 sequence_length=None,
                 **kwargs):
+        """
+        Performs :code:`cell.forward()` repeatedly until reaching the maximum
+        length of `inputs`.
+
+        Parameters:
+            inputs (Variable): A (possibly nested structure of) tensor variable[s]. 
+                The shape of tensor should be `[batch_size, sequence_length, ...]`
+                for `time_major == False` or `[sequence_length, batch_size, ...]`
+                for `time_major == True`. It represents the inputs to be unrolled
+                in RNN.
+            initial_states (Variable, optional): A (possibly nested structure of)
+                tensor variable[s], representing the initial state for RNN. 
+                If not provided, `cell.get_initial_states` would be used to produce
+                the initial state. Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+            **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. 
+
+        Returns:
+            tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \
+                outputs and states, both are Tensor or nested structure of Tensor. \
+                `final_outputs` has the same structure and data types as \
+                the returned `outputs` of :code:`cell.forward` , and each Tensor in `final_outputs` \
+                stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \
+                for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \
+                `final_states` is the counterpart at last time step of initial states, \
+                thus has the same structure with it and has tensors with same shapes \
+                and data types.
+        """
         if fluid.in_dygraph_mode():
 
             class ArrayWrapper(object):
@@ -874,7 +639,1192 @@ def _maybe_copy(state, new_state, step_mask):
         return final_outputs, final_states
 
 
+class StackedRNNCell(RNNCell):
+    """
+    Wrapper allowing a stack of RNN cells to behave as a single cell. It is used
+    to implement stacked RNNs.
+
+    Parameters:
+        cells (list|tuple): List of RNN cell instances.
+
+    Examples:
+
+        .. code-block:: python
+
+            from paddle.incubate.hapi.text import BasicLSTMCell, StackedRNNCell
+
+            cells = [BasicLSTMCell(32, 32), BasicLSTMCell(32, 32)]
+            stack_rnn = StackedRNNCell(cells)
+    """
+
+    def __init__(self, cells):
+        super(StackedRNNCell, self).__init__()
+        self.cells = []
+        for i, cell in enumerate(cells):
+            self.cells.append(self.add_sublayer("cell_%d" % i, cell))
+
+    def forward(self, inputs, states, **kwargs):
+        """
+        Performs :code:`cell.forward` for all contained cells sequentially.
+        Each cell's `inputs` is the `outputs` of the previous cell. And each
+        cell's `states` is the corresponding one in `states`.
+
+        Parameters:
+            inputs (Variable): The inputs for the first cell. Mostly it is a
+                float32 or float64 tensor with shape `[batch_size, input_size]`.
+            states (list): A list containing states for all cells orderly.
+            **kwargs: Additional keyword arguments, which are passed to
+                `cell.forward` for all contained cells.
+
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \
+                `outputs` of the last cell. `new_states` is a list composed \
+                of all cells' `new_states`, and its structure and data type \
+                are the same as those of the `states` argument.
+        """
+        new_states = []
+        for cell, state in zip(self.cells, states):
+            outputs, new_state = cell(inputs, state, **kwargs)
+            inputs = outputs
+            new_states.append(new_state)
+        return outputs, new_states
+
+    @staticmethod
+    def stack_param_attr(param_attr, n):
+        """
+        If `param_attr` is a list or tuple, convert every element in it to a
+        ParamAttr instance. Otherwise, repeat `param_attr` `n` times to
+        construct a list, and rename every one by appending an increasing index
+        suffix (`_0`, `_1`, ...) to avoid same names when it contains a name.
+
+        Parameters:
+            param_attr (list|tuple|ParamAttr): A list or tuple of length `n`, or
+                something that `ParamAttr._to_attr` can convert to a ParamAttr.
+            n (int): The times to repeat to construct a list when `param_attr`
+                is not a list or tuple.
+
+        Returns:
+            list: A list of `n` attributes converted by `ParamAttr._to_attr`.
+        """
+        if isinstance(param_attr, (list, tuple)):
+            assert len(param_attr) == n, (
+                "length of param_attr should be %d when it is a list/tuple" %
+                n)
+            param_attrs = [
+                fluid.ParamAttr._to_attr(attr) for attr in param_attr
+            ]
+        else:
+            param_attrs = []
+            attr = fluid.ParamAttr._to_attr(param_attr)
+            for i in range(n):
+                attr_i = copy.deepcopy(attr)
+                if attr.name:
+                    attr_i.name = attr_i.name + "_" + str(i)
+                param_attrs.append(attr_i)
+        return param_attrs
+
+    @property
+    def state_shape(self):
+        """
+        The `state_shape` of StackedRNNCell is a list composed of each contained
+        cell's `state_shape`.
+
+        Returns:
+            list: A list composed of each contained cell's `state_shape`.
+        """
+        return [cell.state_shape for cell in self.cells]
+
+
+class StackedLSTMCell(RNNCell):
+    """
+    Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used
+    to implement stacked LSTM.
+
+    The formula for LSTM used here is as follows:
+
+    .. math::
+
+        i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
+
+        f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
+
+        c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
+
+        o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
+
+        h_{t} & = o_{t} act_c (c_{t})
+
+
+    Parameters:
+        input_size (int): The input size for the first LSTM cell.
+        hidden_size (int): The hidden size for every LSTM cell.
+        gate_activation (function, optional): The activation function for gates
+            of LSTM, that is :math:`act_g` in the formula. Default: None,
+            which stands for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            LSTM, that is :math:`act_c` in the formula. Default: None,
+            which stands for `fluid.layers.tanh`.
+        forget_bias (float|bool, optional): forget bias used when computing
+            forget gate. It also can accept a boolean value `True`, which sets
+            :math:`forget\\_bias` as 0 but initializes :math:`b_{f}` as 1 and
+            :math:`b_{i}, b_{c}, b_{o}` as 0 in every layer. This is recommended
+            in http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf .
+            Default 1.0.
+        num_layers(int, optional): The number of LSTM to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            LSTM. It also can be a list or tuple, including dropout probabilities
+            for the corresponding LSTM. Default 0.0
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, its length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, its length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import StackedLSTMCell, RNN
+
+            inputs = paddle.rand((2, 4, 32))
+            cell = StackedLSTMCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 forget_bias=1.0,
+                 num_layers=1,
+                 dropout=0.0,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype="float32"):
+        super(StackedLSTMCell, self).__init__()
+        self.dropout = utils.convert_to_list(dropout, num_layers, "dropout",
+                                             float)
+        param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers)
+        bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers)
+
+        # `forget_bias=True`: add no run-time constant but start b_f at 1.
+        # Decide once so EVERY layer gets the initializer, not just layer 0.
+        init_forget_gate = forget_bias is True
+        forget_bias = 0.0 if init_forget_gate else forget_bias
+        self.cells = []
+        for i in range(num_layers):
+            if init_forget_gate:
+                # np.concatenate takes ONE sequence of arrays.
+                bias_attrs[i].initializer = (
+                    fluid.initializer.NumpyArrayInitializer(np.concatenate([
+                        np.zeros(2 * hidden_size), np.ones(hidden_size),
+                        np.zeros(hidden_size)]).astype(dtype)))
+            cell = BasicLSTMCell(
+                input_size=input_size if i == 0 else hidden_size,
+                hidden_size=hidden_size,
+                gate_activation=gate_activation,
+                activation=activation,
+                forget_bias=forget_bias,
+                param_attr=param_attrs[i],
+                bias_attr=bias_attrs[i],
+                dtype=dtype)
+            self.cells.append(self.add_sublayer("lstm_%d" % i, cell))
+
+    def forward(self, inputs, states):
+        """
+        Performs the stacked LSTM cells sequentially. Each cell's `inputs` is
+        the `outputs` of the previous cell. And each cell's `states` is the
+        corresponding one in `states`.
+
+        Parameters:
+            inputs (Variable): The inputs for the first cell. It is a float32 or
+                float64 tensor with shape `[batch_size, input_size]`.
+            states (list): A list containing states for all cells orderly.
+                Each element is the state of one LSTM cell, that is a pair of
+                tensors standing for :math:`h_{t-1}, c_{t-1}` in the formula.
+
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
+                a tensor with shape `[batch_size, hidden_size]`, corresponding \
+                to :math:`h_{t}` in the formula of the last LSTM; `new_states` \
+                is a list composed of every LSTM `new_states` which is a pair \
+                of tensors standing for :math:`h_{t}, c_{t}` in the formula, \
+                and the data type and structure of these tensors are the same \
+                as those of `states`.
+        """
+        new_states = []
+        for i, cell in enumerate(self.cells):
+            outputs, new_state = cell(inputs, states[i])
+            if self.dropout[i] > 0:
+                outputs = layers.dropout(
+                    outputs,
+                    self.dropout[i],
+                    dropout_implementation='upscale_in_train')
+            inputs = outputs
+            new_states.append(new_state)
+        return outputs, new_states
+
+    @property
+    def state_shape(self):
+        """
+        The `state_shape` of StackedLSTMCell is a list composed of each contained
+        LSTM cell's `state_shape`.
+
+        Returns:
+            list: A list composed of each contained LSTM cell's `state_shape`.
+        """
+        return [cell.state_shape for cell in self.cells]
+
+
+class LSTM(Layer):
+    """
+    Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input
+    sequence.
+
+    The formula for LSTM used here is as follows:
+
+    .. math::
+
+        i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
+
+        f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
+
+        c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
+
+        o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
+
+        h_{t} & = o_{t} act_c (c_{t})
+
+
+    Parameters:
+        input_size (int): The input feature size for the first LSTM.
+        hidden_size (int): The hidden size for every LSTM.
+        gate_activation (function, optional): The activation function for gates
+            of LSTM, that is :math:`act_g` in the formula. Default: None,
+            which stands for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            LSTM, that is :math:`act_c` in the formula. Default: None,
+            which stands for `fluid.layers.tanh`.
+        forget_bias (float|bool, optional): forget bias used when computing
+            forget gate. It also can accept a boolean value `True`, which is
+            documented to set :math:`forget\\_bias` as 0 but initialize
+            :math:`b_{f}` as 1 and :math:`b_{i}, b_{c}, b_{o}` as 0. This is
+            recommended in
+            http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . Default 1.0.
+        num_layers(int, optional): The number of LSTM to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            LSTM. It also can be a list or tuple, including dropout probabilities
+            for the corresponding LSTM. Default 0.0
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`.  If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, its length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, its length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import LSTM
+
+            inputs = paddle.rand((2, 4, 32))
+            lstm = LSTM(input_size=32, hidden_size=64, num_layers=2)
+            outputs, _ = lstm(inputs)  # [2, 4, 64]
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 forget_bias=1.0,
+                 num_layers=1,
+                 dropout=0.0,
+                 is_reverse=False,
+                 time_major=False,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype='float32'):
+        super(LSTM, self).__init__()
+        lstm_cell = StackedLSTMCell(input_size, hidden_size, gate_activation,
+                                    activation, forget_bias, num_layers,
+                                    dropout, param_attr, bias_attr, dtype)
+        self.lstm = RNN(lstm_cell, is_reverse, time_major)
+
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        """
+        Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs`
+        is the `inputs` of the subsequent one.
+
+        Parameters:
+            inputs (Variable): The inputs for the first LSTM. It is a float32
+                or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
+            initial_states (list|None, optional): A list containing initial states
+                of all stacked LSTM, and the initial states of each LSTM is a pair
+                of tensors shaped `[batch_size, hidden_size]`. If not provided,
+                use 0 as initial states. Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+
+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is the output of last LSTM and it is a tensor with shape \
+                `[batch_size, sequence_length, hidden_size]` and has the same \
+                data type as `inputs`, `final_states` is the counterpart of \
+                `initial_states` at last time step, thus has the same structure \
+                with it and has tensors with same shapes and data types.
+        """
+        return self.lstm(inputs, initial_states, sequence_length)
+
+
+class BidirectionalRNN(Layer):
+    """
+    Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform
+    forward and backward RNN separately, and merge outputs of these two RNN
+    according to `merge_mode`.
+
+    Parameters:
+        cell_fw (RNNCell): A RNNCell instance used for forward RNN.
+        cell_bw (RNNCell): A RNNCell instance used for backward RNN.
+        merge_mode (str|None, optional): The way to merget outputs of forward and
+            backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None,
+            where None stands for make the two `outputs` as a tuple, `zip` stands
+            for make each two corresponding tensors of the two `outputs` as a tuple.
+            Default `concat`
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN
+
+            inputs = paddle.rand((2, 4, 32))
+            cell_fw = StackedLSTMCell(32, 64)
+            cell_bw = StackedLSTMCell(32, 64)
+            bi_rnn = BidirectionalRNN(cell_fw, cell_bw)
+            outputs, _ = bi_rnn(inputs)  # [2, 4, 128]
+    """
+
+    def __init__(self,
+                 cell_fw,
+                 cell_bw,
+                 merge_mode='concat',
+                 time_major=False,
+                 cell_cls=None,
+                 **kwargs):
+        super(BidirectionalRNN, self).__init__()
+        self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major)
+        self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major)
+        if merge_mode == 'concat':
+            self.merge_func = lambda x, y: layers.concat([x, y], -1)
+        elif merge_mode == 'sum':
+            self.merge_func = lambda x, y: layers.elementwise_add(x, y)
+        elif merge_mode == 'ave':
+            self.merge_func = lambda x, y: layers.scale(
+                layers.elementwise_add(x, y), 0.5)
+        elif merge_mode == 'mul':
+            self.merge_func = lambda x, y: layers.elementwise_mul(x, y)
+        elif merge_mode == 'zip':
+            self.merge_func = lambda x, y: (x, y)
+        elif merge_mode is None:
+            self.merge_func = None
+        else:
+            raise ValueError('Unsupported value for `merge_mode`: %s' %
+                             merge_mode)
+
+    def forward(self,
+                inputs,
+                initial_states=None,
+                sequence_length=None,
+                **kwargs):
+        """
+        Performs forward and backward RNN separately, and merge outputs of these
+        two RNN according to `merge_mode`.
+
+        Parameters:
+            inputs (Variable): A (possibly nested structure of) tensor variable[s]. 
+                The shape of tensor should be `[batch_size, sequence_length, ...]`
+                for `time_major == False` or `[sequence_length, batch_size, ...]`
+                for `time_major == True`. It represents the inputs to be unrolled
+                in both forward and backward RNN.
+            initial_states (Variable|list|tuple): If it is a list or tuple, its
+                length should be 2 to include initial states of forward and backward
+                RNN separately. Otherwise it would be used twice for the two RNN. 
+                If None, `cell.get_initial_states` would be used to produce the initial
+                states. Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+            **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`.
+
+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is produced by merge outputs of forward and backward RNN according \
+                to `merge_mode`, `final_states` is a pair including `final_states` \
+                of forward and backward RNN.
+        """
+        if isinstance(initial_states, (list, tuple)):
+            assert len(
+                initial_states
+            ) == 2, "length of initial_states should be 2 when it is a list/tuple"
+        else:
+            initial_states = [initial_states, initial_states]
+        outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0],
+                                            sequence_length, **kwargs)
+        outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1],
+                                            sequence_length, **kwargs)
+        outputs = map_structure(
+            self.merge_func, outputs_fw,
+            outputs_bw) if self.merge_func else (outputs_fw, outputs_bw)
+        return outputs, (states_fw, states_bw)
+
+    @staticmethod
+    def bidirect_param_attr(param_attr):
+        """
+        Converts `param_attr` to a pair of `param_attr` when it is not a list
+        or tuple with length 2, also rename every one by appending a suffix to
+        avoid having same names when `param_attr` contains a name.
+
+        Parameters:
+            param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+                converted to a ParamAttr instance by `ParamAttr._to_attr`. When
+                it is a list or tuple, its length must be 2.
+
+        Returns:
+            list: A pair composed of forward and backward RNN cell's `param_attr`.
+        """
+        if isinstance(param_attr, (list, tuple)):
+            assert len(
+                param_attr
+            ) == 2, "length of param_attr should be 2 when it is a list/tuple"
+            param_attrs = param_attr
+        else:
+            param_attrs = []
+            attr = fluid.ParamAttr._to_attr(param_attr)
+            attr_fw = copy.deepcopy(attr)
+            if attr.name:
+                attr_fw.name = attr_fw.name + "_fw"
+            param_attrs.append(attr_fw)
+            attr_bw = copy.deepcopy(attr)
+            if attr.name:
+                attr_bw.name = attr_bw.name + "_bw"
+            param_attrs.append(attr_bw)
+        return param_attrs
+
+
+class BidirectionalLSTM(Layer):
+    """
+    Applies a bidirectional multi-layer long short-term memory (LSTM) RNN to an
+    input sequence. 
+    
+    Bidirection interaction can happen after each layer or only after the last
+    layer according to the  `merge_each_layer` setting. The way to interact,
+    that is how to merge outputs of the two direction, is determined by `merge_mode`.
+
+    The formula for LSTM used here is as follows:
+
+    .. math::
+
+        i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
+
+        f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
+
+        c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
+
+        o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
+
+        h_{t} & = o_{t} act_c (c_{t})
+
+
+    Parameters:
+        input_size (int): The input feature size for the first LSTM.
+        hidden_size (int): The hidden size for every LSTM.
+        gate_activation (function, optional): The activation function for gates
+            of LSTM, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            LSTM, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        forget_bias (float, optional): forget bias used when computing forget
+            gate. It also can accept a boolean value `True`, which would set
+            :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and
+            :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in
+            http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf .
+            Default 1.0.
+        num_layers(int, optional): The number of LSTM to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            LSTM. It also can be a list or tuple, including dropout probabilities
+            for the corresponding LSTM. Default 0.0
+        merge_mode (str|None, optional): The way to merget outputs of forward and
+            backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None,
+            where None stands for make the two `outputs` as a tuple, `zip` stands
+            for make each two corresponding tensors of the two `outputs` as a tuple.
+            Default `concat`
+        merge_each_layer (bool, optional): Indicate whether bidirection interaction
+            happens after each layer or only after the last layer. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`.  If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import BidirectionalLSTM
+
+            inputs = paddle.rand((2, 4, 32))
+            bi_lstm = BidirectionalLSTM(input_size=32, hidden_size=64, num_layers=2)
+            outputs, _ = bi_lstm(inputs)  # [2, 4, 128]
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 forget_bias=1.0,
+                 num_layers=1,
+                 dropout=0.0,
+                 merge_mode='concat',
+                 merge_each_layer=False,
+                 time_major=False,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype='float32'):
+        super(BidirectionalLSTM, self).__init__()
+        self.num_layers = num_layers
+        self.merge_mode = merge_mode
+        self.merge_each_layer = merge_each_layer
+        param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr)
+        bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr)
+        if not merge_each_layer:
+            cell_fw = StackedLSTMCell(input_size, hidden_size, gate_activation,
+                                      activation, forget_bias, num_layers,
+                                      dropout, param_attrs[0], bias_attrs[0],
+                                      dtype)
+            cell_bw = StackedLSTMCell(input_size, hidden_size, gate_activation,
+                                      activation, forget_bias, num_layers,
+                                      dropout, param_attrs[1], bias_attrs[1],
+                                      dtype)
+            self.lstm = BidirectionalRNN(
+                cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major)
+        else:
+            fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0],
+                                                             num_layers)
+            bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1],
+                                                             num_layers)
+            fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0],
+                                                            num_layers)
+            bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1],
+                                                            num_layers)
+
+            # maybe design cell including both forward and backward later
+            self.lstm = []
+            for i in range(num_layers):
+                cell_fw = StackedLSTMCell(
+                    input_size if i == 0 else (hidden_size * 2
+                                               if merge_mode == 'concat' else
+                                               hidden_size), hidden_size,
+                    gate_activation, activation, forget_bias, 1, dropout,
+                    fw_param_attrs[i], fw_bias_attrs[i], dtype)
+                cell_bw = StackedLSTMCell(
+                    input_size if i == 0 else (hidden_size * 2
+                                               if merge_mode == 'concat' else
+                                               hidden_size), hidden_size,
+                    gate_activation, activation, forget_bias, 1, dropout,
+                    bw_param_attrs[i], bw_bias_attrs[i], dtype)
+                self.lstm.append(
+                    self.add_sublayer(
+                        "lstm_%d" % i,
+                        BidirectionalRNN(
+                            cell_fw,
+                            cell_bw,
+                            merge_mode=merge_mode,
+                            time_major=time_major)))
+
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        """
+        Performs bidirectional multi-layer LSTM layer by layer. Each LSTM's `outputs`
+        is the `inputs` of the subsequent one, or when `merge_each_layer` is True,
+        merged outputs would be the `inputs` of the subsequent one.
+
+        Parameters:
+            inputs (Variable): The inputs for the first LSTM. It is a float32
+                or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
+            initial_states (list|None, optional): A list containing initial states 
+                of all stacked LSTM. If `merge_each_layer` is True, the length of
+                list should be `num_layers` and a single value would be reused for
+                `num_layers`; Otherwise, the length should be 2 and a single value
+                would be reused twice. If not provided, use 0 as initial states.
+                Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+
+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is the output of last bidirectional LSTM; `final_states` is a \
+                pair including `final_states` of forward and backward LSTM when \
+                `merge_each_layer` is False or a list including `final_states` \
+                of all stacked bidirectional LSTM, and it has tensors with same \
+                shapes data types as `initial_states`.
+        """
+        if not self.merge_each_layer:
+            return self.lstm(inputs, initial_states, sequence_length)
+        else:
+            if isinstance(initial_states, (list, tuple)):
+                assert len(initial_states) == self.num_layers, (
+                    "length of initial_states should be %d when it is a list/tuple"
+                    % self.num_layers)
+            else:
+                initial_states = [initial_states] * self.num_layers
+            stacked_states = []
+            for i in range(self.num_layers):
+                outputs, states = self.lstm[i](inputs, initial_states[i],
+                                               sequence_length)
+                inputs = outputs
+                stacked_states.append(states)
+            return outputs, stacked_states
+
+
+class StackedGRUCell(RNNCell):
+    """
+    Wrapper allowing a stack of GRU cells to behave as a single cell. It is used
+    to implement stacked GRU.
+
+    The formula for GRU used here is as follows:
+
+    .. math::
+
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+
+
+    Parameters:
+        input_size (int): The input size for the first GRU cell.
+        hidden_size (int): The hidden size for every GRU cell.
+        gate_activation (function, optional): The activation function for gates
+            of GRU, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            GRU, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        num_layers(int, optional): The number of LSTM to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            GRU. It also can be a list or tuple, including dropout probabilities
+            for the corresponding GRU. Default 0.0
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import StackedGRUCell, RNN
+
+            inputs = paddle.rand((2, 4, 32))
+            cell = StackedGRUCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 num_layers=1,
+                 dropout=0.0,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype="float32"):
+        super(StackedGRUCell, self).__init__()
+        self.dropout = utils.convert_to_list(dropout, num_layers, "dropout",
+                                             float)
+        param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers)
+        bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers)
+
+        self.cells = []
+        for i in range(num_layers):
+            self.cells.append(
+                self.add_sublayer(
+                    "gru_%d" % i,
+                    BasicGRUCell(
+                        input_size=input_size if i == 0 else hidden_size,
+                        hidden_size=hidden_size,
+                        gate_activation=gate_activation,
+                        activation=activation,
+                        param_attr=param_attrs[i],
+                        bias_attr=bias_attrs[i],
+                        dtype=dtype)))
+
+    def forward(self, inputs, states):
+        """
+        Performs the stacked GRU cells sequentially. Each cell's `inputs` is
+        the `outputs` of the previous cell. And each cell's `states` is the
+        corresponding one in `states`.
+
+        Parameters:
+            inputs (Variable): The inputs for the first cell. It is a float32 or
+                float64 tensor with shape `[batch_size, input_size]`.
+            states (list): A list containing states for all cells orderly.
+            **kwargs: Additional keyword arguments, which passed to `cell.forward`
+                for all including cells.
+
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
+                a tensor with shape `[batch_size, hidden_size]`, corresponding \
+                to :math:`h_{t}` in the formula of the last GRU; `new_states` \
+                is a list composed of every GRU `new_states` which is also \
+                :math:`h_{t}` in the formula, and the data type and structure \
+                of these tensors all is same as that of `states`.
+        """
+        new_states = []
+        for i, cell in enumerate(self.cells):
+            outputs, new_state = cell(inputs, states[i])
+            outputs = layers.dropout(
+                outputs,
+                self.dropout[i],
+                dropout_implementation='upscale_in_train') if self.dropout[
+                    i] > 0 else outputs
+            inputs = outputs
+            new_states.append(new_state)
+        return outputs, new_states
+
+    @property
+    def state_shape(self):
+        """
+        The `state_shape` of StackedGRUCell is a list composed of each including
+        GRU cell's `state_shape`.
+
+        Returns:
+            list: A list composed of each including GRU cell's `state_shape`.
+        """
+        return [cell.state_shape for cell in self.cells]
+
+
+class GRU(Layer):
+    """
+    Applies a stacked multi-layer gated recurrent unit (GRU) RNN to an input
+    sequence.
+
+    The formula for GRU used here is as follows:
+
+    .. math::
+
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+
+
+    Parameters:
+        input_size (int): The input feature size for the first GRU cell.
+        hidden_size (int): The hidden size for every GRU cell.
+        gate_activation (function, optional): The activation function for gates
+            of GRU, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            GRU, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        num_layers(int, optional): The number of GRU to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            GRU. It also can be a list or tuple, including dropout probabilities
+            for the corresponding GRU. Default 0.0
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`.  If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import GRU
+
+            inputs = paddle.rand((2, 4, 32))
+            gru = GRU(input_size=32, hidden_size=64, num_layers=2)
+            outputs, _ = gru(inputs)  # [2, 4, 64]
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 num_layers=1,
+                 dropout=0.0,
+                 is_reverse=False,
+                 time_major=False,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype='float32'):
+        super(GRU, self).__init__()
+        gru_cell = StackedGRUCell(input_size, hidden_size, gate_activation,
+                                  activation, num_layers, dropout, param_attr,
+                                  bias_attr, dtype)
+        self.gru = RNN(gru_cell, is_reverse, time_major)
+
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        """
+        Performs the stacked multi-layer GRU layer by layer. Each GRU's `outputs`
+        is the `inputs` of the subsequent one.
+
+        Parameters:
+            inputs (Variable): The inputs for the first GRU. It is a float32
+                or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
+            initial_states (list|None, optional): A list containing initial states 
+                of all stacked GRU, and the initial states of each GRU is a tensor
+                shaped `[batch_size, hidden_size]`. If not provided, use 0 as initial
+                states. Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+
+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is the output of last GRU and it is a tensor with shape \
+                `[batch_size, sequence_length, hidden_size]` and has the same \
+                data type as `inputs`, `final_states` is the counterpart of \
+                `initial_states` at last time step, thus has the same structure \
+                with it and has tensors with same shapes data types.
+        """
+        return self.gru(inputs, initial_states, sequence_length)
+
+
+class BidirectionalGRU(Layer):
+    """
+    Applies a bidirectional multi-layer gated recurrent unit (GRU) RNN to an input
+    sequence.
+    
+    Bidirection interaction can happen after each layer or only after the last
+    layer according to the  `merge_each_layer` setting. The way to interact,
+    that is how to merge outputs of the two direction, is determined by `merge_mode`.
+
+    The formula for GRU used here is as follows:
+
+    .. math::
+
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+
+
+    Parameters:
+        input_size (int): The input feature size  for the first GRU cell.
+        hidden_size (int): The hidden size for every GRU cell.
+        gate_activation (function, optional): The activation function for gates
+            of GRU, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            GRU, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        num_layers(int, optional): The number of GRU to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            GRU. It also can be a list or tuple, including dropout probabilities
+            for the corresponding GRU. Default 0.0
+        merge_mode (str|None, optional): The way to merget outputs of forward and
+            backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None,
+            where None stands for make the two `outputs` as a tuple, `zip` stands
+            for make each two corresponding tensors of the two `outputs` as a tuple.
+            Default `concat`
+        merge_each_layer (bool, optional): Indicate whether bidirection interaction
+            happens after each layer or only after the last layer. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`.  If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import BidirectionalGRU
+
+            inputs = paddle.rand((2, 4, 32))
+            bi_gru = BidirectionalGRU(input_size=32, hidden_size=64, num_layers=2)
+            outputs, _ = bi_gru(inputs)  # [2, 4, 128]
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 forget_bias=1.0,
+                 num_layers=1,
+                 dropout=0.0,
+                 merge_mode='concat',
+                 merge_each_layer=False,
+                 time_major=False,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype='float32'):
+        super(BidirectionalGRU, self).__init__()
+        self.num_layers = num_layers
+        self.merge_mode = merge_mode
+        self.merge_each_layer = merge_each_layer
+        param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr)
+        bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr)
+        if not merge_each_layer:
+            cell_fw = StackedGRUCell(input_size, hidden_size, gate_activation,
+                                     activation, num_layers, dropout,
+                                     param_attrs[0], bias_attrs[0], dtype)
+            cell_bw = StackedGRUCell(input_size, hidden_size, gate_activation,
+                                     activation, num_layers, dropout,
+                                     param_attrs[1], bias_attrs[1], dtype)
+            self.gru = BidirectionalRNN(
+                cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major)
+        else:
+            fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0],
+                                                             num_layers)
+            bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1],
+                                                             num_layers)
+            fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0],
+                                                            num_layers)
+            bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1],
+                                                            num_layers)
+
+            # maybe design cell including both forward and backward later
+            self.gru = []
+            for i in range(num_layers):
+                cell_fw = StackedGRUCell(input_size if i == 0 else (
+                    hidden_size * 2 if merge_mode == 'concat' else
+                    hidden_size), hidden_size, gate_activation, activation, 1,
+                                         dropout, fw_param_attrs[i],
+                                         fw_bias_attrs[i], dtype)
+                cell_bw = StackedGRUCell(input_size if i == 0 else (
+                    hidden_size * 2 if merge_mode == 'concat' else
+                    hidden_size), hidden_size, gate_activation, activation, 1,
+                                         dropout, bw_param_attrs[i],
+                                         bw_bias_attrs[i], dtype)
+                self.gru.append(
+                    self.add_sublayer(
+                        "gru_%d" % i,
+                        BidirectionalRNN(
+                            cell_fw,
+                            cell_bw,
+                            merge_mode=merge_mode,
+                            time_major=time_major)))
+
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        """
+        Performs bidirectional multi-layer GRU layer by layer. Each GRU's `outputs`
+        is the `inputs` of the subsequent one, or when `merge_each_layer` is True,
+        merged outputs would be the `inputs` of the subsequent one.
+
+        Parameters:
+            inputs (Variable): The inputs for the first GRU. It is a float32
+                or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
+            initial_states (list|None, optional): A list containing initial states 
+                of all stacked GRU. If `merge_each_layer` is True, the length of
+                list should be `num_layers` and a single value would be reused for
+                `num_layers`; Otherwise, the length should be 2 and a single value
+                would be reused twice. If not provided, use 0 as initial states.
+                Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+
+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is the output of last bidirectional GRU; `final_states` is a \
+                pair including `final_states` of forward and backward GRU when \
+                `merge_each_layer` is False or a list including `final_states` \
+                of all stacked bidirectional GRU, and it has tensors with same \
+                shapes and data types as `initial_states`.
+        """
+        if not self.merge_each_layer:
+            return self.gru(inputs, initial_states, sequence_length)
+        else:
+            if isinstance(initial_states, (list, tuple)):
+                assert len(initial_states) == self.num_layers, (
+                    "length of initial_states should be %d when it is a list/tuple"
+                    % self.num_layers)
+            else:
+                initial_states = [initial_states] * self.num_layers
+            stacked_states = []
+            for i in range(self.num_layers):
+                outputs, states = self.gru[i](inputs, initial_states[i],
+                                              sequence_length)
+                inputs = outputs
+                stacked_states.append(states)
+            return outputs, stacked_states
+
+
 class DynamicDecode(Layer):
+    """
+    DynamicDecode integrates a Decoder instance to perform dynamic decoding.
+
+    It performs :code:`decoder.step()` repeatedly until the returned Tensor
+    indicating finished status contains all True values or the number of
+    decoding step reaches to :attr:`max_step_num`.
+
+    :code:`decoder.initialize()` would be called once before the decoding loop.
+    If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()`
+    would be called once after the decoding loop.
+
+    Parameters:
+        decoder (Decoder): An instance of `Decoder`.
+        max_step_num (int, optional): The maximum number of steps. If not provided,
+            decode until the decoder is fully done, or in other words, the returned
+            Tensor by :code:`decoder.step()` indicating finished status contains
+            all True. Default `None`.
+        output_time_major (bool, optional): Indicate the data layout of Tensor included
+            in the final outputs (the first returned value of this method). If
+            `False`, the data layout would be batch major with shape
+            `[batch_size, seq_len, ...]`.  If `True`, the data layout would
+            be time major with shape `[seq_len, batch_size, ...]`. Default: `False`.
+        impute_finished (bool, optional): If `True`, then states get copied through
+            for batch entries which are marked as finished, which differs with the
+            unfinished using the new states returned by :code:`decoder.step()` and
+            ensures that the final states have the correct values. Otherwise, states
+            wouldn't be copied through when finished. If the returned `final_states`
+            is needed, it should be set as True, which causes some slowdown.
+            Default `False`.
+        is_test (bool, optional): A flag indicating whether to use test mode. In
+            test mode, it is more memory saving. Default `False`.
+        return_length (bool, optional):  A flag indicating whether to return an
+            extra Tensor variable in the output tuple, which stores the actual
+            lengths of all decoded sequences. Default `False`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.fluid.layers import BeamSearchDecoder
+            from paddle.incubate.hapi.text import StackedLSTMCell, DynamicDecode
+
+            paddle.enable_dygraph()
+
+            vocab_size, d_model, = 100, 32
+            encoder_output = paddle.rand((2, 4, d_model))
+            trg_embeder = fluid.dygraph.Embedding(size=[vocab_size, d_model])
+            output_layer = fluid.dygraph.Linear(d_model, vocab_size)
+            cell = StackedLSTMCell(input_size=d_model, hidden_size=d_model)
+            decoder = BeamSearchDecoder(cell,
+                                        start_token=0,
+                                        end_token=1,
+                                        beam_size=4,
+                                        embedding_fn=trg_embeder,
+                                        output_fn=output_layer)
+            dynamic_decoder = DynamicDecode(decoder, max_step_num=10)
+            outputs = dynamic_decoder(cell.get_initial_states(encoder_output))
+    """
+
     def __init__(self,
                  decoder,
                  max_step_num=None,
@@ -891,6 +1841,35 @@ def __init__(self,
         self.return_length = return_length
 
     def forward(self, inits=None, **kwargs):
+        """
+        Performs :code:`decoder.step()` repeatedly until the returned Tensor
+        indicating finished status contains all True values or the number of
+        decoding step reaches to :attr:`max_step_num`.
+
+        :code:`decoder.initialize()` would be called once before the decoding loop.
+        If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()`
+        would be called once after the decoding loop.
+
+        Parameters:
+            inits (object, optional): Argument passed to `decoder.initialize`.
+                Default `None`.
+            **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`.
+
+        Returns:
+            tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \
+                when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \
+                The final outputs and states, both are Tensor or nested structure of Tensor. \
+                `final_outputs` has the same structure and data types as the :code:`outputs` \
+                returned by :code:`decoder.step()` , and each Tensor in `final_outputs` \
+                is the stack of all decoding steps' outputs, which might be revised \
+                by :code:`decoder.finalize()` if the decoder has implemented `finalize`. \
+                `final_states` is the counterpart at last time step of initial states \
+                returned by :code:`decoder.initialize()` , thus has the same structure \
+                with it and has tensors with same shapes and data types. `sequence_lengths` \
+                is an `int64` tensor with the same shape as `finished` returned \
+                by :code:`decoder.initialize()` , and it stores the actual lengths of \
+                all decoded sequences.
+        """
         if fluid.in_dygraph_mode():
 
             class ArrayWrapper(object):
@@ -982,47 +1961,538 @@ def _maybe_copy(state, new_state, step_mask):
             except NotImplementedError:
                 pass
 
-            if not self.output_time_major:
-                final_outputs = map_structure(
-                    lambda x: layers.transpose(x, [1, 0] + list(
-                        range(2, len(x.shape)))), final_outputs)
+            if not self.output_time_major:
+                final_outputs = map_structure(
+                    lambda x: layers.transpose(x, [1, 0] + list(
+                        range(2, len(x.shape)))), final_outputs)
+
+            return (final_outputs, final_states,
+                    sequence_lengths) if self.return_length else (
+                        final_outputs, final_states)
+        else:
+            return fluid.layers.dynamic_decode(
+                self.decoder,
+                inits,
+                max_step_num=self.max_step_num,
+                output_time_major=self.output_time_major,
+                impute_finished=self.impute_finished,
+                is_test=self.is_test,
+                return_length=self.return_length,
+                **kwargs)
+
+
+class Conv1dPoolLayer(Layer):
+    """
+    This interface is used to construct a callable object of the ``Conv1dPoolLayer``
+    class. The ``Conv1dPoolLayer`` class does a ``Conv1D`` and a ``Pool1D`` .
+    For more details, refer to code examples. The ``Conv1dPoolLayer`` layer calculates
+    the output based on the input, filter and strides, paddings, dilations, groups,
+    global_pooling, pool_type, ceil_mode, exclusive parameters.
+
+    Parameters:
+        num_channels (int): The number of channels in the input data.
+        num_filters(int): The number of filters. It is the same as the output channels.
+        filter_size (int): The filter size of Conv1DPoolLayer.       
+        pool_size (int): The pooling size of Conv1DPoolLayer.
+        conv_stride (int): The stride size of the conv Layer in Conv1DPoolLayer.
+            Default: 1
+        pool_stride (int): The stride size of the pool layer in Conv1DPoolLayer.
+            Default: 1
+        conv_padding (int): The padding size of the conv Layer in Conv1DPoolLayer.
+            Default: 0
+        pool_padding (int): The padding of pool layer in Conv1DPoolLayer.
+            Default: 0
+        act (str): Activation type for conv layer, if it is set to None, activation
+            is not appended. Default: None.
+        pool_type (str): Pooling type can be `max` for max-pooling or `avg` for
+            average-pooling. Default: `max`
+        dilation (int): The dilation size of the conv Layer. Default: 1.
+        groups (int): The groups number of the conv Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the
+            first half of the filters is only connected to the first half of the
+            input channels, while the second half of the filters is only connected
+            to the second half of the input channels. Default: 1.
+        global_pooling (bool): Whether to use the global pooling. If it is true, 
+                `pool_size` and `pool_padding` would be ignored. Default: False
+        ceil_mode (bool, optional): Whether to use the ceil function to calculate output 
+                height and width. False is the default. If it is set to False, the floor function
+                will be used. Default: False.
+        exclusive (bool, optional): Whether to exclude padding points in average pooling mode. 
+                Default: True.
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: False
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
+            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+
+    Example:
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import Conv1dPoolLayer
+
+            # input: [batch_size, num_channels, sequence_length]
+            input = paddle.rand((2, 32, 4))
+            cov2d = Conv1dPoolLayer(num_channels=32,
+                                    num_filters=64,
+                                    filter_size=2,
+                                    pool_size=2)
+            output = cov2d(input)
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 pool_size,
+                 conv_stride=1,
+                 pool_stride=1,
+                 conv_padding=0,
+                 pool_padding=0,
+                 act=None,
+                 pool_type='max',
+                 global_pooling=False,
+                 dilation=1,
+                 groups=None,
+                 ceil_mode=False,
+                 exclusive=True,
+                 use_cudnn=False,
+                 param_attr=None,
+                 bias_attr=None):
+        super(Conv1dPoolLayer, self).__init__()
+        self._conv2d = Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=[filter_size, 1],
+            stride=[conv_stride, 1],
+            padding=[conv_padding, 0],
+            dilation=[dilation, 1],
+            groups=groups,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            use_cudnn=use_cudnn,
+            act=act)
+        self._pool2d = Pool2D(
+            pool_size=[pool_size, 1],
+            pool_type=pool_type,
+            pool_stride=[pool_stride, 1],
+            pool_padding=[pool_padding, 0],
+            global_pooling=global_pooling,
+            use_cudnn=use_cudnn,
+            ceil_mode=ceil_mode,
+            exclusive=exclusive)
+
+    def forward(self, input):
+        """
+        Performs conv1d and pool1d on the input.
+
+        Parameters:
+            input (Variable): A 3-D Tensor, shape is [N, C, H] where N, C and H
+                representing `batch_size`, `num_channels` and `sequence_length`
+                separately. data type can be float32 or float64
+        
+        Returns:
+            Variable: The 3-D output tensor after conv and pool. It has the same \
+                data type as input.
+        """
+        x = fluid.layers.unsqueeze(input, axes=[-1])
+        x = self._conv2d(x)
+        x = self._pool2d(x)
+        x = fluid.layers.squeeze(x, axes=[-1])
+        return x
+
 
-            return (final_outputs, final_states,
-                    sequence_lengths) if self.return_length else (
-                        final_outputs, final_states)
-        else:
-            return fluid.layers.dynamic_decode(
-                self.decoder,
-                inits,
-                max_step_num=self.max_step_num,
-                output_time_major=self.output_time_major,
-                impute_finished=self.impute_finished,
-                is_test=self.is_test,
-                return_length=self.return_length,
-                **kwargs)
+class CNNEncoder(Layer):
+    """
+    This interface is used to construct a callable object of the ``CNNEncoder``
+    class. The ``CNNEncoder`` is composed of multiple ``Conv1dPoolLayer`` .
+    ``CNNEncoder`` can define every Conv1dPoolLayer with different or same parameters.
+    The ``Conv1dPoolLayer`` in ``CNNEncoder`` is parallel. The results of every 
+    ``Conv1dPoolLayer`` will concat at the channel dimension as the final output.
+
+    Parameters:
+        num_channels(int|list|tuple): The number of channels in the input data. If
+            `num_channels` is a list or tuple, the length of `num_channels` must
+            equal to `num_layers`. If `num_channels` is a int, all conv1dpoollayer's
+            `num_channels` are the value of `num_channels`. 
+        num_filters(int|list|tuple): The number of filters. It is the same as the
+            output channels. If `num_filters` is a list or tuple, the length of
+            `num_filters` must equal `num_layers`. If `num_filters` is a int,
+            all conv1dpoollayer's `num_filters` are the value of `num_filters`.
+        filter_size(int|list|tuple): The filter size of Conv1DPoolLayer in CNNEncoder.
+            If `filter_size` is a list or tuple, the length of `filter_size` must
+            equal `num_layers`. If `filter_size` is a int, all conv1dpoollayer's
+            `filter_size` are the value of `filter_size`. 
+        pool_size(int|list|tuple): The pooling size of Conv1DPoolLayer in CNNEncoder.
+            If `pool_size` is a list or tuple, the length of `pool_size` must equal
+            `num_layers`. If `pool_size` is a int, all conv1dpoollayer's `pool_size`
+            are the value of `pool_size`.
+        num_layers(int): The number of conv1dpoolLayer used in CNNEncoder.
+        conv_stride(int|list|tuple): The stride size of the conv Layer in Conv1DPoolLayer.
+            If `conv_stride` is a list or tuple, the length of `conv_stride` must
+            equal `num_layers`. If conv_stride is a int, all conv1dpoollayer's `conv_stride`
+            are the value of `conv_stride`. Default: 1
+        pool_stride(int|list|tuple): The stride size of the pool layer in Conv1DPoolLayer.
+            If `pool_stride` is a list or tuple, the length of `pool_stride` must
+            equal `num_layers`. If `pool_stride` is a int, all conv1dpoollayer's `pool_stride`
+            are the value of `pool_stride`. Default: 1
+        conv_padding(int|list|tuple): The padding size of the conv Layer in Conv1DPoolLayer.
+            If `conv_padding` is a list or tuple, the length of `conv_padding` must
+            equal `num_layers`. If `conv_padding` is a int, all conv1dpoollayer's `conv_padding`
+            are the value of `conv_padding`. Default: 0
+        pool_padding(int|list|tuple): The padding size of pool layer in Conv1DPoolLayer.
+            If `pool_padding` is a list or tuple, the length of `pool_padding` must
+            equal `num_layers`. If `pool_padding` is an int, all conv1dpoollayer's `pool_padding`
+            are the value of `pool_padding`. Default: 0
+        act (str|list|tuple): Activation type for `Conv1dPoollayer` layer, if it is set to None,
+            activation is not appended. Default: None.
+        pool_type (str): Pooling type can be `max` for max-pooling or `avg` for
+            average-pooling. Default: `max`
+        global_pooling (bool): Whether to use the global pooling. If it is true, 
+            `pool_size` and `pool_padding` would be ignored. Default: False
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: False
+    
+    Example:
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import CNNEncoder
+
+            # input: [batch_size, num_channels, sequence_length]
+            input = paddle.rand((2, 32, 8))
+            cov_encoder = CNNEncoder(num_layers=2,
+                                     num_channels=32,
+                                     num_filters=64,
+                                     filter_size=[2, 3],
+                                     pool_size=[7, 6])
+            output = cov_encoder(input)  # [2, 128, 1]
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 pool_size,
+                 num_layers=1,
+                 conv_stride=1,
+                 pool_stride=1,
+                 conv_padding=0,
+                 pool_padding=0,
+                 act=None,
+                 pool_type='max',
+                 global_pooling=False,
+                 use_cudnn=False):
+        super(CNNEncoder, self).__init__()
+        self.num_layers = num_layers
+        self.num_channels = num_channels
+        self.num_filters = num_filters
+        self.filter_size = filter_size
+        self.pool_size = pool_size
+        self.conv_stride = conv_stride
+        self.pool_stride = pool_stride
+        self.conv_padding = conv_padding
+        self.pool_padding = pool_padding
+        self.use_cudnn = use_cudnn
+        self.act = act
+        self.pool_type = pool_type
+        self.global_pooling = global_pooling
+        self.conv1d_pool_layers = fluid.dygraph.LayerList([
+            Conv1dPoolLayer(
+                num_channels=self.num_channels if
+                isinstance(self.num_channels, int) else self.num_channels[i],
+                num_filters=self.num_filters
+                if isinstance(self.num_filters, int) else self.num_filters[i],
+                filter_size=self.filter_size
+                if isinstance(self.filter_size, int) else self.filter_size[i],
+                pool_size=self.pool_size
+                if isinstance(self.pool_size, int) else self.pool_size[i],
+                conv_stride=self.conv_stride
+                if isinstance(self.conv_stride, int) else self.conv_stride[i],
+                pool_stride=self.pool_stride
+                if isinstance(self.pool_stride, int) else self.pool_stride[i],
+                conv_padding=self.conv_padding
+                if isinstance(self.conv_padding,
+                              int) else self.conv_padding[i],
+                pool_padding=self.pool_padding
+                if isinstance(self.pool_padding,
+                              int) else self.pool_padding[i],
+                act=self.act[i]
+                if isinstance(self.act, (list, tuple)) else self.act,
+                pool_type=self.pool_type,
+                global_pooling=self.global_pooling,
+                use_cudnn=self.use_cudnn) for i in range(num_layers)
+        ])
+
+    def forward(self, input):
+        """
+        Performs multiple parallel conv1d and pool1d, and concat the results of
+        them at the channel dimension to produce the final output.
+
+        Parameters:
+            input (Variable): A 3-D Tensor, shape is [N, C, H] where N, C and H
+                representing `batch_size`, `num_channels` and `sequence_length`
+                separately. data type can be float32 or float64
+        
+        Returns:
+            Variable: The 3-D output tensor produced by concatenating results of \
+                all Conv1dPoolLayer. It has the same data type as input.
+        """
+        res = [
+            conv1d_pool_layer(input)
+            for conv1d_pool_layer in self.conv1d_pool_layers
+        ]
+        out = fluid.layers.concat(input=res, axis=1)
+        return out
 
 
-class TransfomerCell(object):
+class TransformerCell(RNNCell):
     """
-    Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be
-    used as RNNCell
+    TransformerCell wraps a Transformer decoder producing logits from `inputs`
+    composed by ids and position.
+
+    Parameters:
+        decoder(callable): A TransformerDecoder instance. Or a wrapper of it that
+            includes an embedding layer accepting ids and positions instead of embeddings
+            and includes an output layer transforming decoder output features to logits.
+        embedding_fn(function, optional): A callable that accepts ids and position
+            as arguments and returns embeddings as input of `decoder`. It can be
+            None if `decoder` includes an embedding layer. Default None.
+        output_fn(callable, optional): A callable applied on `decoder` output to
+            transform decoder output features to get logits. Mostly it is a Linear
+            layer with vocabulary size. It can be None if `decoder` includes an
+            output layer. Default None.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Embedding, Linear
+            from paddle.incubate.hapi.text import TransformerDecoder
+            from paddle.incubate.hapi.text import TransformerCell
+            from paddle.incubate.hapi.text import TransformerBeamSearchDecoder
+            from paddle.incubate.hapi.text import DynamicDecode
+
+            paddle.enable_dygraph()
+
+            class Embedder(fluid.dygraph.Layer):
+                def __init__(self):
+                    super(Embedder, self).__init__()
+                    self.word_embedder = Embedding(size=[1000, 128])
+                    self.pos_embedder = Embedding(size=[500, 128])
+
+                def forward(self, word, position):
+                    return self.word_embedder(word) + self.pos_embedder(position)
+
+            embedder = Embedder()
+            output_layer = Linear(128, 1000)
+            decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
+            transformer_cell = TransformerCell(decoder, embedder, output_layer)
+            dynamic_decoder = DynamicDecode(
+                TransformerBeamSearchDecoder(
+                    transformer_cell,
+                    start_token=0,
+                    end_token=1,
+                    beam_size=4,
+                    var_dim_in_state=2),
+                max_step_num=10,
+                is_test=True)
+            
+            enc_output = paddle.rand((2, 4, 128))
+            # cross attention bias: [batch_size, n_head, trg_len, src_len]
+            trg_src_attn_bias = paddle.rand((2, 2, 1, 4))
+            # inputs for beam search on Transformer
+            caches = transformer_cell.get_initial_states(enc_output)
+            enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
+                enc_output, beam_size=4)
+            trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
+                trg_src_attn_bias, beam_size=4)
+            static_caches = decoder.prepare_static_cache(enc_output)
+            outputs = dynamic_decoder(
+                inits=caches,
+                enc_output=enc_output,
+                trg_src_attn_bias=trg_src_attn_bias,
+                static_caches=static_caches)
     """
 
-    def __init__(self, decoder):
+    def __init__(self, decoder, embedding_fn=None, output_fn=None):
+        super(TransformerCell, self).__init__()
         self.decoder = decoder
+        self.embedding_fn = embedding_fn
+        self.output_fn = output_fn
+
+    def forward(self,
+                inputs,
+                states=None,
+                enc_output=None,
+                trg_slf_attn_bias=None,
+                trg_src_attn_bias=None,
+                static_caches=[]):
+        """
+        Produces logits from `inputs` composed by ids and positions.
+
+        Parameters:
+            inputs(tuple): A tuple includes target ids and positions. The two
+                tensors both have int64 data type and with 2D shape 
+                `[batch_size, sequence_length]` where `sequence_length` is 1
+                for inference.
+            states(list): It caches the multi-head attention intermediate results
+                of history decoding steps. It is a list of dict where the length
+                of list is decoder layer number, and each dict has `k` and `v` as
+                keys and values are cached results. Default None
+            enc_output(Variable): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, sequence_length, d_model]`. The data type
+                should be float32 or float64.
+            trg_slf_attn_bias(Variable, optional): A tensor used in decoder self
+                attention to mask out attention on unwanted target positions. It
+                is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. It can be None when nothing wanted or needed to
+                be masked out. It can be None for inference. The data type should
+                be float32 or float64. Default None
+            trg_src_attn_bias(Variable, optional): A tensor used in decoder-encoder
+                cross attention to mask out unwanted attention on source (encoder output).
+                It is a tensor with shape `[batch_size, n_head, target_length, source_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. It can be None when nothing wanted or needed to
+                be masked out. The data type should be float32 or float64. Default None
+            static_caches(list): It stores projected results of encoder output
+                to be used as keys and values in decoder-encoder cross attention
+                It is a list of dict where the length of list is decoder layer
+                number, and each dict has `static_k` and `static_v` as keys and
+                values are stored results. Default empty list
 
-    def __call__(self, inputs, states, trg_src_attn_bias, enc_output,
-                 static_caches):
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \
+                is a float32 or float64 3D tensor representing logits shaped \
+                `[batch_size, sequence_length, vocab_size]`. `new_states` has \
+                the same structure and data type as `states` while the length \
+                is one larger since the intermediate results of current step are \
+                concatenated into it.
+        """
         trg_word, trg_pos = inputs
-        for cache, static_cache in zip(states, static_caches):
-            cache.update(static_cache)
-        logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
-                              enc_output, states)
-        new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states]
-        return logits, new_states
+        if states and static_caches:
+            for cache, static_cache in zip(states, static_caches):
+                cache.update(static_cache)
+        if self.embedding_fn is not None:
+            dec_input = self.embedding_fn(trg_word, trg_pos)
+            outputs = self.decoder(dec_input, enc_output, None,
+                                   trg_src_attn_bias, states)
+        else:
+            outputs = self.decoder(trg_word, trg_pos, enc_output, None,
+                                   trg_src_attn_bias, states)
+        if self.output_fn is not None:
+            outputs = self.output_fn(outputs)
+
+        new_states = [{
+            "k": cache["k"],
+            "v": cache["v"]
+        } for cache in states] if states else states
+        return outputs, new_states
+
+    @property
+    def state_shape(self):
+        """
+        States of TransformerCell cache the multi-head attention intermediate
+        results of history decoding steps, and have an increasing length as
+        decoding continues.
+        
+        `state_shape` of TransformerCell is used to initialize states. It is a
+        list of dict where the length of list is decoder layer number, and each dict
+        has `k` and `v` as keys and values are `[n_head, 0, d_key]`, `[n_head, 0, d_value]`
+        separately. (-1 for batch size would be automatically inserted into shape).
+
+        Returns:
+            list: It is a list of dict where the length of list is decoder layer \
+                number, and each dict has `k` and `v` as keys and values are cached \
+                results.
+        """
+        return [{
+            "k": [self.decoder.n_head, 0, self.decoder.d_key],
+            "v": [self.decoder.n_head, 0, self.decoder.d_value],
+        } for i in range(self.decoder.n_layer)]
 
 
 class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
+    """
+    Compared with a RNN step :code:`outputs, new_states = cell(inputs, states)`,
+    Transformer decoder's `inputs` uses 2D tensor shaped `[batch_size * beam_size, 1]`
+    and includes extra position data. And its `states` (caches) has increasing
+    length. These are not consistent with `BeamSearchDecoder`, thus subclass
+    `BeamSearchDecoder` to make beam search adapt to Transformer decoder.
+
+    Parameters:
+        cell(TransformerCell): An instance of `TransformerCell`.
+        start_token(int): The start token id.
+        end_token(int): The end token id.
+        beam_size(int): The beam width used in beam search.
+        var_dim_in_state(int): Indicate which dimension of states is variant.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Embedding, Linear
+            from paddle.incubate.hapi.text import TransformerDecoder
+            from paddle.incubate.hapi.text import TransformerCell
+            from paddle.incubate.hapi.text import TransformerBeamSearchDecoder
+            from paddle.incubate.hapi.text import DynamicDecode
+
+            paddle.enable_dygraph()
+
+            class Embedder(fluid.dygraph.Layer):
+                def __init__(self):
+                    super(Embedder, self).__init__()
+                    self.word_embedder = Embedding(size=[1000, 128])
+                    self.pos_embedder = Embedding(size=[500, 128])
+
+                def forward(self, word, position):
+                    return self.word_embedder(word) + self.pos_embedder(position)
+
+            embedder = Embedder()
+            output_layer = Linear(128, 1000)
+            decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
+            transformer_cell = TransformerCell(decoder, embedder, output_layer)
+            dynamic_decoder = DynamicDecode(
+                TransformerBeamSearchDecoder(
+                    transformer_cell,
+                    start_token=0,
+                    end_token=1,
+                    beam_size=4,
+                    var_dim_in_state=2),
+                max_step_num=10,
+                is_test=True)
+            
+            enc_output = paddle.rand((2, 4, 128))
+            # cross attention bias: [batch_size, n_head, trg_len, src_len]
+            trg_src_attn_bias = paddle.rand((2, 2, 1, 4))
+            # inputs for beam search on Transformer
+            caches = transformer_cell.get_initial_states(enc_output)
+            enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
+                enc_output, beam_size=4)
+            trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
+                trg_src_attn_bias, beam_size=4)
+            static_caches = decoder.prepare_static_cache(enc_output)
+            outputs = dynamic_decoder(
+                inits=caches,
+                enc_output=enc_output,
+                trg_src_attn_bias=trg_src_attn_bias,
+                static_caches=static_caches)
+    """
+
     def __init__(self, cell, start_token, end_token, beam_size,
                  var_dim_in_state):
         super(TransformerBeamSearchDecoder,
@@ -1031,6 +2501,18 @@ def __init__(self, cell, start_token, end_token, beam_size,
         self.var_dim_in_state = var_dim_in_state
 
     def _merge_batch_beams_with_var_dim(self, x):
+        """
+        Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new
+        tensor with shape `[batch_size * beam_size, ...]`. 
+
+        Parameters:
+            x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The
+                data type should be float32, float64, int32, int64 or bool.
+
+        Returns:
+            Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \
+                data type is same as `x`.
+        """
         # init length of cache is 0, and it increases with decoding carrying on,
         # thus need to reshape elaborately
         var_dim_in_state = self.var_dim_in_state + 1  # count in beam dim
@@ -1048,6 +2530,18 @@ def _merge_batch_beams_with_var_dim(self, x):
         return x
 
     def _split_batch_beams_with_var_dim(self, x):
+        """
+        Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new
+        tensor with shape `[batch_size, beam_size, ...]`. 
+
+        Parameters:
+            x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The
+                data type should be float32, float64, int32, int64 or bool.
+
+        Returns:
+            Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \
+                data type is same as `x`.     
+        """
         var_dim_size = layers.shape(x)[self.var_dim_in_state]
         x = layers.reshape(
             x, [-1, self.beam_size] +
@@ -1057,6 +2551,38 @@ def _split_batch_beams_with_var_dim(self, x):
         return x
 
     def step(self, time, inputs, states, **kwargs):
+        """
+        Perform a beam search decoding step, which uses `cell` to get probabilities,
+        and follows a beam search step to calculate scores and select candidate
+        token ids.
+
+        Note: compared with `BeamSearchDecoder.step`, it feeds a 2D id tensor shaped
+        `[batch_size * beam_size, 1]` rather than `[batch_size * beam_size]`, combined
+        with position data, as inputs to `cell`.
+
+        Parameters:
+            time(Variable): An `int64` tensor with shape `[1]` provided by the caller,
+                representing the current time step number of decoding.
+            inputs(Variable): A tensor variable. It is same as `initial_inputs`
+                returned by `initialize()` for the first decoding step and
+                `next_inputs` returned by `step()` for the others. It is an int64
+                id tensor with shape `[batch_size * beam_size]`.
+            states(Variable): A structure of tensor variables.
+                It is same as the `initial_states` returned by `initialize()` for
+                the first decoding step and `beam_search_state` returned by
+                `step()` for the others.
+            **kwargs: Additional keyword arguments, provided by the caller. 
+        
+        Returns:
+            tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \
+                `beam_search_state` and `next_inputs` have the same structure, \
+                shape and data type as the input arguments `states` and `inputs` separately. \
+                `beam_search_output` is a namedtuple(including scores, predicted_ids, \
+                parent_ids as fields) of tensor variables, where \
+                `scores, predicted_ids, parent_ids` all have a tensor value shaped \
+                `[batch_size, beam_size]` with data type `float32, int64, int64`. \
+                `finished` is a `bool` tensor with shape `[batch_size, beam_size]`.
+        """
         # compared to RNN, Transformer has 3D data at every decoding step
         inputs = layers.reshape(inputs, [-1, 1])  # token
         pos = layers.ones_like(inputs) * time  # pos
@@ -1065,6 +2591,11 @@ def step(self, time, inputs, states, **kwargs):
 
         cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states,
                                                    **kwargs)
+
+        # squeeze to adapt to BeamSearchDecoder which uses 2D logits
+        cell_outputs = map_structure(
+            lambda x: layers.squeeze(x, [1]) if len(x.shape) == 3 else x,
+            cell_outputs)
         cell_outputs = map_structure(self._split_batch_beams, cell_outputs)
         next_cell_states = map_structure(self._split_batch_beams_with_var_dim,
                                          next_cell_states)
@@ -1083,14 +2614,34 @@ def step(self, time, inputs, states, **kwargs):
 ### Transformer Modules ###
 class PrePostProcessLayer(Layer):
     """
-    PrePostProcessLayer
+    PrePostProcessLayer is used before/after each multi-head attention(MHA) and
+    feed-forward network(FFN) sub-layer to perform some specific process on
+    inputs/outputs.
+
+    Parameters:
+        process_cmd (str): The process applied before/after each MHA and
+            FFN sub-layer. It should be a string composed of `d`, `a`, `n`,
+            where `d` for dropout, `a` for add residual connection, `n` for
+            layer normalization.
+        d_model (int): The expected feature size in the input and output.
+        dropout_rate (float): The dropout probability if the process includes
+            dropout. Default 0.1
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import PrePostProcessLayer
+
+            # input: [batch_size, sequence_length, d_model]
+            x = paddle.rand((2, 4, 32))
+            process = PrePostProcessLayer('n', 32)
+            out = process(x)  # [2, 4, 32]
     """
 
-    def __init__(self,
-                 process_cmd,
-                 d_model,
-                 dropout_rate,
-                 reused_layer_norm=None):
+    def __init__(self, process_cmd, d_model, dropout_rate=0.1):
         super(PrePostProcessLayer, self).__init__()
         self.process_cmd = process_cmd
         self.functors = []
@@ -1099,15 +2650,12 @@ def __init__(self,
                 self.functors.append(
                     lambda x, y: x + y if y is not None else x)
             elif cmd == "n":  # add layer normalization
-                if reused_layer_norm is not None:
-                    layer_norm = reused_layer_norm
-                else:
-                    layer_norm = LayerNorm(
-                        normalized_shape=d_model,
-                        param_attr=fluid.ParamAttr(
-                            initializer=fluid.initializer.Constant(1.)),
-                        bias_attr=fluid.ParamAttr(
-                            initializer=fluid.initializer.Constant(0.)))
+                layer_norm = LayerNorm(
+                    normalized_shape=d_model,
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Constant(1.)),
+                    bias_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Constant(0.)))
 
                 self.functors.append(
                     self.add_sublayer(
@@ -1120,6 +2668,21 @@ def __init__(self,
                                      if dropout_rate else x)
 
     def forward(self, x, residual=None):
+        """
+        Applies `process_cmd` specified process on `x`.
+
+        Parameters:
+            x (Variable): The tensor to be processed. The data type should be float32
+                or float64. The shape is `[batch_size, sequence_length, d_model]`.
+                
+            residual (Variable, optional): Only used if the process includes
+                residual connection. It has the same shape and data type as `x`.
+                Default None
+
+        Returns:
+            Variable: The processed tensor. It has the same shape and data type \
+                    as `x`.
+        """
         for i, cmd in enumerate(self.process_cmd):
             if cmd == "a":
                 x = self.functors[i](x, residual)
@@ -1130,19 +2693,40 @@ def forward(self, x, residual=None):
 
 class MultiHeadAttention(Layer):
     """
-    Multi-Head Attention
+    MultiHead Attention maps queries and a set of key-value pairs to outputs
+    by jointly attending to information from different representation subspaces;
+    as the name multi-head indicates, it performs multiple attentions in parallel.
+
+    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
+    for more details.
+
+    Parameters:
+        d_key (int): The feature size to transform queries and keys as in
+            multi-head attention. Mostly it equals `d_model // n_head`.
+        d_value (int): The feature size to transform values as in multi-head
+            attention. Mostly it equals `d_model // n_head`.
+        d_model (int): The expected feature size in the input and output.
+        n_head (int): The number of heads in multi-head attention(MHA).
+        dropout_rate (float, optional): The dropout probability used in MHA to
+            drop some attention target. Default 0.1
+         
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import MultiHeadAttention
+
+            # encoder input: [batch_size, sequence_length, d_model]
+            query = paddle.rand((2, 4, 128))
+            # self attention bias: [batch_size, n_head, src_len, src_len]
+            attn_bias = paddle.rand((2, 2, 4, 4))
+            multi_head_attn = MultiHeadAttention(64, 64, 128, n_head=2)
+            output = multi_head_attn(query, attn_bias=attn_bias)  # [2, 4, 128]
     """
 
-    def __init__(self,
-                 d_key,
-                 d_value,
-                 d_model,
-                 n_head=1,
-                 dropout_rate=0.0,
-                 reused_query_fc=None,
-                 reused_key_fc=None,
-                 reused_value_fc=None,
-                 reused_proj_fc=None):
+    def __init__(self, d_key, d_value, d_model, n_head, dropout_rate=0.1):
 
         super(MultiHeadAttention, self).__init__()
         self.n_head = n_head
@@ -1151,32 +2735,47 @@ def __init__(self,
         self.d_model = d_model
         self.dropout_rate = dropout_rate
 
-        if reused_query_fc is not None:
-            self.q_fc = reused_query_fc
-        else:
-            self.q_fc = Linear(
-                input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
-        if reused_key_fc is not None:
-            self.k_fc = reused_key_fc
-        else:
-            self.k_fc = Linear(
-                input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
-        if reused_value_fc is not None:
-            self.v_fc = reused_value_fc
-        else:
-            self.v_fc = Linear(
-                input_dim=d_model,
-                output_dim=d_value * n_head,
-                bias_attr=False)
-        if reused_proj_fc is not None:
-            self.proj_fc = reused_proj_fc
-        else:
-            self.proj_fc = Linear(
-                input_dim=d_value * n_head,
-                output_dim=d_model,
-                bias_attr=False)
+        self.q_fc = Linear(
+            input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
+        self.k_fc = Linear(
+            input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
+        self.v_fc = Linear(
+            input_dim=d_model, output_dim=d_value * n_head, bias_attr=False)
+        self.proj_fc = Linear(
+            input_dim=d_value * n_head, output_dim=d_model, bias_attr=False)
 
     def _prepare_qkv(self, queries, keys, values, cache=None):
+        """
+        Prepares linear projected queries, keys and values for usage of subsequent
+        multiple attention in parallel. If `cache` is not None, uses cached
+        results to reduce redundant calculations.
+
+        Parameters:
+            queries (Variable): The queries for multi-head attention. It is a
+                tensor with shape `[batch_size, sequence_length, d_model]`. The
+                data type should be float32 or float64.
+            keys (Variable, optional): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, sequence_length, d_model]`. The
+                data type should be float32 or float64.
+            values (Variable, optional): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, sequence_length, d_model]`.
+                The data type should be float32 or float64.
+            cache(dict, optional): It is a dict with `k` and `v` as keys, and
+                values cache the multi-head attention intermediate results of
+                history decoding steps for decoder self attention; Or a dict
+                with `static_k` and `static_v` as keys, and values store intermediate
+                results of encoder output for decoder-encoder cross attention.
+                If it is for decoder self attention, values for `k` and `v` would
+                be updated by new tensors concatenating raw tensors with intermediate
+                results of current step. It is only used for inference and should
+                be None for training. Default None
+
+        Returns:
+            tuple: A tuple including linear projected queries, keys and values. \
+                The keys and values have shapes `[batch_size, n_head, sequence_length, d_key]` \
+                and `[batch_size, n_head, sequence_length, d_value]` separately, \
+                and their data types are same as inputs.
+        """
         if keys is None:  # self-attention
             keys, values = queries, queries
             static_kv = False
@@ -1212,13 +2811,53 @@ def _prepare_qkv(self, queries, keys, values, cache=None):
 
         return q, k, v
 
-    def forward(self, queries, keys, values, attn_bias, cache=None):
+    def forward(self,
+                queries,
+                keys=None,
+                values=None,
+                attn_bias=None,
+                cache=None):
+        """
+        Applies multi-head attention to map queries and a set of key-value pairs
+        to outputs.
+
+        Parameters:
+            queries (Variable): The queries for multi-head attention. It is a
+                tensor with shape `[batch_size, sequence_length, d_model]`. The
+                data type should be float32 or float64.
+            keys (Variable, optional): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, sequence_length, d_model]`. The
+                data type should be float32 or float64.
+            values (Variable, optional): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, sequence_length, d_model]`.
+                The data type should be float32 or float64.
+            attn_bias (Variable, optional): A tensor used in multi-head attention
+                to mask out attention on unwanted positions, usually the
+                paddings or the subsequent positions. It is a tensor with shape
+                `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+            cache(dict, optional): It is a dict with `k` and `v` as keys, and
+                values cache the multi-head attention intermediate results of
+                history decoding steps for decoder self attention; Or a dict
+                with `static_k` and `static_v` as keys, and values store intermediate
+                results of encoder output for decoder-encoder cross attention.
+                If it is for decoder self attention, values for `k` and `v` would
+                be updated by new tensors concatenating raw tensors with intermediate
+                results of current step. It is only used for inference and should
+                be None for training. Default None
+
+        Returns:
+            Variable: The output of multi-head attention. It is a tensor \
+                that has the same shape and data type as `queries`.
+        """
         # compute q ,k ,v
         q, k, v = self._prepare_qkv(queries, keys, values, cache)
 
         # scale dot product attention
         product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
+            x=q, y=k, transpose_y=True, alpha=self.d_key**-0.5)
         if attn_bias is not None:
             product += attn_bias
         weights = layers.softmax(product)
@@ -1237,6 +2876,25 @@ def forward(self, queries, keys, values, attn_bias, cache=None):
         return out
 
     def cal_kv(self, keys, values):
+        """
+        Applies linear projection on input keys and values, then splits heads
+        (reshape and transpose) to get keys and values from different representation
+        subspaces for usage of subsequent multiple attention in parallel.
+
+        Parameters:
+            keys (Variable, optional): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, sequence_length, d_model]`. The
+                data type should be float32 or float64.
+            values (Variable, optional): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, sequence_length, d_model]`.
+                The data type should be float32 or float64.
+
+        Returns:
+            tuple: A tuple including linear projected keys and values. These two \
+                tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \
+                and `[batch_size, n_head, sequence_length, d_value]` separately, \
+                and their data types are same as inputs.
+        """
         k = self.k_fc(keys)
         v = self.v_fc(values)
         k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
@@ -1248,29 +2906,53 @@ def cal_kv(self, keys, values):
 
 class FFN(Layer):
     """
-    Feed-Forward Network
+    A fully connected feed-forward network applied to each position separately
+    and identically. This consists of two linear transformations with an activation
+    and dropout in between.
+
+    Parameters:
+        d_inner_hid (int): The hidden size in the feedforward network(FFN).
+        d_model (int): The expected feature size in the input and output.
+        dropout_rate (float, optional): The dropout probability used after
+            activation. Default 0.1
+        fc1_act (str, optional): The activation function in the feedforward
+            network. Default relu.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import FFN
+
+            # input: [batch_size, sequence_length, d_model]
+            x = paddle.rand((2, 4, 32))
+            ffn = FFN(128, 32)
+            out = ffn(x)  # [2, 4, 32]
     """
 
-    def __init__(self,
-                 d_inner_hid,
-                 d_model,
-                 dropout_rate,
-                 fc1_act="relu",
-                 reused_fc1=None,
-                 reused_fc2=None):
+    def __init__(self, d_inner_hid, d_model, dropout_rate=0.1, fc1_act="relu"):
         super(FFN, self).__init__()
         self.dropout_rate = dropout_rate
-        if reused_fc1 is not None:
-            self.fc1 = reused_fc1
-        else:
-            self.fc1 = Linear(
-                input_dim=d_model, output_dim=d_inner_hid, act=fc1_act)
-        if reused_fc2 is not None:
-            self.fc2 = reused_fc2
-        else:
-            self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
+        self.fc1 = Linear(
+            input_dim=d_model, output_dim=d_inner_hid, act=fc1_act)
+        self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
 
     def forward(self, x):
+        """
+        Applies a fully connected feed-forward network on each position of the
+        input sequences separately and identically.
+
+        Parameters:
+            x (Variable): The input of feed-forward network. It is a tensor
+                with shape `[batch_size, sequence_length, d_model]`. The data
+                type should be float32 or float64.
+
+        Returns:
+            Variable: The output of feed-forward network. It is a tensor that has \
+                the same shape and data type as `x`.
+        """
         hidden = self.fc1(x)
         if self.dropout_rate:
             hidden = layers.dropout(
@@ -1281,7 +2963,50 @@ def forward(self, x):
 
 class TransformerEncoderLayer(Layer):
     """
-    EncoderLayer
+    TransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
+    attention and feedforward network. Before and after each sub-layer, pre-process
+    and post-process would be applied on the input and output.
+
+    Parameters:
+        n_head (int): The number of heads in multi-head attention(MHA).
+        d_key (int): The feature size to transform queries and keys as in
+            multi-head attention. Mostly it equals `d_model // n_head`.
+        d_value (int): The feature size to transform values as in multi-head
+            attention. Mostly it equals `d_model // n_head`.
+        d_model (int): The expected feature size in the input and output.
+        d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
+        prepostprocess_dropout (float, optional): The dropout probability used
+            in pre-process and post-process of MHA and FFN sub-layer. Default 0.1
+        attention_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. Default 0.1
+        relu_dropout (float, optional): The dropout probability used after FFN
+            activation. Default 0.1
+        preprocess_cmd (str, optional): The process applied before each MHA and
+            FFN sub-layer, and it also would be applied on output of the last
+            stacked layer. It should be a string composed of `d`, `a`, `n`,
+            where `d` for dropout, `a` for add residual connection, `n` for
+            layer normalization. Default `n`.
+        postprocess_cmd (str, optional): The process applied after each MHA and
+            FFN sub-layer. Same as `preprocess_cmd`. It should be a string
+            composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
+            residual connection, `n` for layer normalization. Default `da`.
+        ffn_fc1_act (str, optional): The activation function in the feedforward
+            network. Default relu.
+         
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import TransformerEncoderLayer
+
+            # encoder input: [batch_size, src_len, d_model]
+            enc_input = paddle.rand((2, 4, 128))
+            # self attention bias: [batch_size, n_head, src_len, src_len]
+            attn_bias = paddle.rand((2, 2, 4, 4))
+            encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512)
+            enc_output = encoder_layer(enc_input, attn_bias)  # [2, 4, 128]
     """
 
     def __init__(self,
@@ -1290,58 +3015,47 @@ def __init__(self,
                  d_value,
                  d_model,
                  d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
+                 prepostprocess_dropout=0.1,
+                 attention_dropout=0.1,
+                 relu_dropout=0.1,
                  preprocess_cmd="n",
                  postprocess_cmd="da",
-                 ffn_fc1_act="relu",
-                 reused_pre_selatt_layernorm=None,
-                 reused_multihead_att_weights={
-                     "reused_query_fc": None,
-                     "reused_key_fc": None,
-                     "reused_value_fc": None,
-                     "reused_proj_fc": None
-                 },
-                 reused_post_selfatt_layernorm=None,
-                 reused_pre_ffn_layernorm=None,
-                 reused_ffn_weights={"reused_fc1": None,
-                                     "reused_fc2": None},
-                 reused_post_ffn_layernorm=None):
+                 ffn_fc1_act="relu"):
 
         super(TransformerEncoderLayer, self).__init__()
 
         self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout,
-                                                 reused_pre_selatt_layernorm)
-        self.self_attn = MultiHeadAttention(
-            d_key,
-            d_value,
-            d_model,
-            n_head,
-            attention_dropout,
-            reused_query_fc=reused_multihead_att_weights["reused_query_fc"],
-            reused_key_fc=reused_multihead_att_weights["reused_key_fc"],
-            reused_value_fc=reused_multihead_att_weights["reused_value_fc"],
-            reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"])
-        self.postprocesser1 = PrePostProcessLayer(
-            postprocess_cmd, d_model, prepostprocess_dropout,
-            reused_post_selfatt_layernorm)
+                                                 prepostprocess_dropout)
+        self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
+                                            attention_dropout)
+        self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
+                                                  prepostprocess_dropout)
 
         self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout,
-                                                 reused_pre_ffn_layernorm)
-        self.ffn = FFN(d_inner_hid,
-                       d_model,
-                       relu_dropout,
-                       fc1_act=ffn_fc1_act,
-                       reused_fc1=reused_ffn_weights["reused_fc1"],
-                       reused_fc2=reused_ffn_weights["reused_fc2"])
+                                                 prepostprocess_dropout)
+        self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act)
         self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout,
-                                                  reused_post_ffn_layernorm)
+                                                  prepostprocess_dropout)
+
+    def forward(self, enc_input, attn_bias=None):
+        """
+        Applies a Transformer encoder layer on the input.
 
-    def forward(self, enc_input, attn_bias):
+        Parameters:
+            enc_input (Variable): The input of Transformer encoder layer. It is
+                a tensor with shape `[batch_size, sequence_length, d_model]`.
+                The data type should be float32 or float64.
+            attn_bias(Variable, optional): A tensor used in encoder self attention
+                to mask out attention on unwanted positions, usually the paddings. It
+                is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+
+        Returns:
+            Variable: The output of Transformer encoder layer. It is a tensor that \
+                has the same shape and data type as `enc_input`.
+        """
         attn_output = self.self_attn(
             self.preprocesser1(enc_input), None, None, attn_bias)
         attn_output = self.postprocesser1(attn_output, enc_input)
@@ -1353,7 +3067,49 @@ def forward(self, enc_input, attn_bias):
 
 class TransformerEncoder(Layer):
     """
-    encoder
+    TransformerEncoder is a stack of N encoder layers.
+
+    Parameters:
+        n_layer (int): The number of encoder layers to be stacked.
+        n_head (int): The number of heads in multi-head attention(MHA).
+        d_key (int): The feature size to transform queries and keys as in
+            multi-head attention. Mostly it equals to `d_model // n_head`.
+        d_value (int): The feature size to transform values as in multi-head
+            attention. Mostly it equals to `d_model // n_head`.
+        d_model (int): The expected feature size in the input and output.
+        d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
+        prepostprocess_dropout (float, optional): The dropout probability used
+            in pre-process and post-process of MHA and FFN sub-layer. Default 0.1
+        attention_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. Default 0.1
+        relu_dropout (float, optional): The dropout probability used after FFN
+            activation. Default 0.1
+        preprocess_cmd (str, optional): The process applied before each MHA and
+            FFN sub-layer, and it also would be applied on output of the last
+            stacked layer. It should be a string composed of `d`, `a`, `n`,
+            where `d` for dropout, `a` for add residual connection, `n` for
+            layer normalization. Default `n`.
+        postprocess_cmd (str, optional): The process applied after each MHA and
+            FFN sub-layer. Same as `preprocess_cmd`. It should be a string
+            composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
+            residual connection, `n` for layer normalization. Default `da`.
+        ffn_fc1_act (str, optional): The activation function in the feedforward
+            network. Default relu.
+         
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import TransformerEncoder
+
+            # encoder input: [batch_size, src_len, d_model]
+            enc_input = paddle.rand((2, 4, 128))
+            # self attention bias: [batch_size, n_head, src_len, src_len]
+            attn_bias = paddle.rand((2, 2, 4, 4))
+            encoder = TransformerEncoder(2, 2, 64, 64, 128, 512)
+            enc_output = encoder(enc_input, attn_bias)  # [2, 4, 128]
     """
 
     def __init__(self,
@@ -1363,9 +3119,9 @@ def __init__(self,
                  d_value,
                  d_model,
                  d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
+                 prepostprocess_dropout=0.1,
+                 attention_dropout=0.1,
+                 relu_dropout=0.1,
                  preprocess_cmd="n",
                  postprocess_cmd="da",
                  ffn_fc1_act="relu"):
@@ -1392,7 +3148,25 @@ def __init__(self,
         self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
                                              prepostprocess_dropout)
 
-    def forward(self, enc_input, attn_bias):
+    def forward(self, enc_input, attn_bias=None):
+        """
+        Applies a stack of N Transformer encoder layers on input sequences.
+
+        Parameters:
+            enc_input (Variable): The input of Transformer encoder. It is a tensor
+                with shape `[batch_size, sequence_length, d_model]`. The data
+                type should be float32 or float64.
+            attn_bias(Variable, optional): A tensor used in encoder self attention
+                to mask out attention on unwanted positions, usually the paddings. It
+                is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+
+        Returns:
+            Variable: The output of Transformer encoder. It is a tensor that has \
+                the same shape and data type as `enc_input`.
+        """
         for encoder_layer in self.encoder_layers:
             enc_output = encoder_layer(enc_input, attn_bias)
             enc_input = enc_output
@@ -1402,7 +3176,58 @@ def forward(self, enc_input, attn_bias):
 
 class TransformerDecoderLayer(Layer):
     """
-    decoder
+    TransformerDecoderLayer is composed of three sub-layers which are decoder
+    self (multi-head) attention, decoder-encoder cross attention and feedforward
+    network. Before and after each sub-layer, pre-process and post-process would
+    be applied on the input and output.
+
+    Parameters:
+        n_head (int): The number of heads in multi-head attention(MHA).
+        d_key (int): The feature size to transform queries and keys as in
+            multi-head attention. Mostly it equals to `d_model // n_head`.
+        d_value (int): The feature size to transform values as in multi-head
+            attention. Mostly it equals to `d_model // n_head`.
+        d_model (int): The expected feature size in the input and output.
+        d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
+        prepostprocess_dropout (float, optional): The dropout probability used
+            in pre-process and post-process of MHA and FFN sub-layer. Default 0.1
+        attention_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. Default 0.1
+        relu_dropout (float, optional): The dropout probability used after FFN
+            activation. Default 0.1
+        preprocess_cmd (str, optional): The process applied before each MHA and
+            FFN sub-layer, and it also would be applied on output of the last
+            stacked layer. It should be a string composed of `d`, `a`, `n`,
+            where `d` for dropout, `a` for add residual connection, `n` for
+            layer normalization. Default `n`.
+        postprocess_cmd (str, optional): The process applied after each MHA and
+            FFN sub-layer. Same as `preprocess_cmd`. It should be a string
+            composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
+            residual connection, `n` for layer normalization. Default `da`.
+        ffn_fc1_act (str, optional): The activation function in the feedforward
+            network. Default relu.
+         
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import TransformerDecoderLayer
+
+            # decoder input: [batch_size, trg_len, d_model]
+            dec_input = paddle.rand((2, 4, 128))
+            # encoder output: [batch_size, src_len, d_model]
+            enc_output = paddle.rand((2, 6, 128))
+            # self attention bias: [batch_size, n_head, trg_len, trg_len]
+            self_attn_bias = paddle.rand((2, 2, 4, 4))
+            # cross attention bias: [batch_size, n_head, trg_len, src_len]
+            cross_attn_bias = paddle.rand((2, 2, 4, 6))
+            decoder_layer = TransformerDecoderLayer(2, 64, 64, 128, 512)
+            output = decoder_layer(dec_input,
+                                   enc_output,
+                                   self_attn_bias,
+                                   cross_attn_bias)  # [2, 4, 128]
     """
 
     def __init__(self,
@@ -1411,90 +3236,72 @@ def __init__(self,
                  d_value,
                  d_model,
                  d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
+                 prepostprocess_dropout=0.1,
+                 attention_dropout=0.1,
+                 relu_dropout=0.1,
                  preprocess_cmd="n",
                  postprocess_cmd="da",
-                 reused_pre_selfatt_layernorm=None,
-                 reused_self_multihead_att_weights={
-                     "reused_query_fc": None,
-                     "reused_key_fc": None,
-                     "reused_value_fc": None,
-                     "reused_proj_fc": None
-                 },
-                 reused_post_selfatt_layernorm=None,
-                 reused_pre_crossatt_layernorm=None,
-                 reused_cross_multihead_att_weights={
-                     "reused_query_fc": None,
-                     "reused_key_fc": None,
-                     "reused_value_fc": None,
-                     "reused_proj_fc": None
-                 },
-                 reused_post_crossatt_layernorm=None,
-                 reused_pre_ffn_layernorm=None,
-                 reused_ffn_weights={"reused_fc1": None,
-                                     "reused_fc2": None},
-                 reused_post_ffn_layernorm=None):
+                 ffn_fc1_act="relu"):
         super(TransformerDecoderLayer, self).__init__()
 
         self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout,
-                                                 reused_pre_selfatt_layernorm)
-        self.self_attn = MultiHeadAttention(
-            d_key,
-            d_value,
-            d_model,
-            n_head,
-            attention_dropout,
-            reused_query_fc=reused_self_multihead_att_weights[
-                "reused_query_fc"],
-            reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"],
-            reused_value_fc=reused_self_multihead_att_weights[
-                "reused_value_fc"],
-            reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"])
-        self.postprocesser1 = PrePostProcessLayer(
-            postprocess_cmd, d_model, prepostprocess_dropout,
-            reused_post_selfatt_layernorm)
+                                                 prepostprocess_dropout)
+        self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
+                                            attention_dropout)
+        self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
+                                                  prepostprocess_dropout)
 
         self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout,
-                                                 reused_pre_crossatt_layernorm)
-        self.cross_attn = MultiHeadAttention(
-            d_key,
-            d_value,
-            d_model,
-            n_head,
-            attention_dropout,
-            reused_query_fc=reused_cross_multihead_att_weights[
-                "reused_query_fc"],
-            reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"],
-            reused_value_fc=reused_cross_multihead_att_weights[
-                "reused_value_fc"],
-            reused_proj_fc=reused_cross_multihead_att_weights[
-                "reused_proj_fc"])
-        self.postprocesser2 = PrePostProcessLayer(
-            postprocess_cmd, d_model, prepostprocess_dropout,
-            reused_post_crossatt_layernorm)
+                                                 prepostprocess_dropout)
+        self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
+                                             attention_dropout)
+        self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
+                                                  prepostprocess_dropout)
 
         self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout,
-                                                 reused_pre_ffn_layernorm)
-        self.ffn = FFN(d_inner_hid,
-                       d_model,
-                       relu_dropout,
-                       reused_fc1=reused_ffn_weights["reused_fc1"],
-                       reused_fc2=reused_ffn_weights["reused_fc2"])
+                                                 prepostprocess_dropout)
+        self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act)
         self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout,
-                                                  reused_post_ffn_layernorm)
+                                                  prepostprocess_dropout)
 
     def forward(self,
                 dec_input,
                 enc_output,
-                self_attn_bias,
-                cross_attn_bias,
+                self_attn_bias=None,
+                cross_attn_bias=None,
                 cache=None):
+        """
+        Applies a Transformer decoder layer on the input.
+
+        Parameters:
+            dec_input (Variable): The input of Transformer decoder. It is a tensor
+                with shape `[batch_size, target_length, d_model]`. The data type
+                should be float32 or float64.
+            enc_output (Variable): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            self_attn_bias (Variable, optional): A tensor used in decoder self attention
+                to mask out attention on unwanted positions, usually the subsequent positions.
+                It is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+            cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross
+                attention to mask out attention on unwanted positions, usually the paddings.
+                It is a tensor with shape `[batch_size, n_head, target_length, source_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+            cache (dict, optional): It caches the multi-head attention intermediate
+                results of history decoding steps and encoder output. It is a dict
+                that has `k`, `v`, `static_k`, `static_v` as keys and values are cached
+                results. It is only used for inference and should be None for
+                training. Default None
+
+        Returns:
+            Variable: The output of Transformer decoder layer. It is a tensor \
+                that has the same shape and data type as `dec_input`.
+        """
         self_attn_output = self.self_attn(
             self.preprocesser1(dec_input), None, None, self_attn_bias, cache)
         self_attn_output = self.postprocesser1(self_attn_output, dec_input)
@@ -1513,14 +3320,78 @@ def forward(self,
 
 class TransformerDecoder(Layer):
     """
-    decoder
+    TransformerDecoder is a stack of N decoder layers.
+
+    Parameters:
+        n_layer (int): The number of decoder layers to be stacked.
+        n_head (int): The number of heads in multi-head attention(MHA).
+        d_key (int): The feature size to transform queries and keys as in
+            multi-head attention. Mostly it equals to `d_model // n_head`.
+        d_value (int): The feature size to transform values as in multi-head
+            attention. Mostly it equals to `d_model // n_head`.
+        d_model (int): The expected feature size in the input and output.
+        d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
+        prepostprocess_dropout (float, optional): The dropout probability used
+            in pre-process and post-process of MHA and FFN sub-layer. Default 0.1
+        attention_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. Default 0.1
+        relu_dropout (float, optional): The dropout probability used after FFN
+            activation. Default 0.1
+        preprocess_cmd (str, optional): The process applied before each MHA and
+            FFN sub-layer, and it also would be applied on output of the last
+            stacked layer. It should be a string composed of `d`, `a`, `n`,
+            where `d` for dropout, `a` for add residual connection, `n` for
+            layer normalization. Default `n`.
+        postprocess_cmd (str, optional): The process applied after each MHA and
+            FFN sub-layer. Same as `preprocess_cmd`. It should be a string
+            composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
+            residual connection, `n` for layer normalization. Default `da`.
+        ffn_fc1_act (str, optional): The activation function in the feedforward
+            network. Default relu.
+         
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import TransformerDecoder
+
+            # decoder input: [batch_size, trg_len, d_model]
+            dec_input = paddle.rand((2, 4, 128))
+            # encoder output: [batch_size, src_len, d_model]
+            enc_output = paddle.rand((2, 6, 128))
+            # self attention bias: [batch_size, n_head, trg_len, trg_len]
+            self_attn_bias = paddle.rand((2, 2, 4, 4))
+            # cross attention bias: [batch_size, n_head, trg_len, src_len]
+            cross_attn_bias = paddle.rand((2, 2, 4, 6))
+            decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
+            dec_output = decoder(dec_input,
+                                 enc_output,
+                                 self_attn_bias,
+                                 cross_attn_bias)  # [2, 4, 128]
     """
 
-    def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
-                 prepostprocess_dropout, attention_dropout, relu_dropout,
-                 preprocess_cmd, postprocess_cmd):
+    def __init__(self,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout=0.1,
+                 attention_dropout=0.1,
+                 relu_dropout=0.1,
+                 preprocess_cmd="n",
+                 postprocess_cmd="da",
+                 ffn_fc1_act="relu"):
         super(TransformerDecoder, self).__init__()
 
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.d_key = d_key
+        self.d_value = d_value
+
         self.decoder_layers = list()
         for i in range(n_layer):
             self.decoder_layers.append(
@@ -1536,18 +3407,67 @@ def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
     def forward(self,
                 dec_input,
                 enc_output,
-                self_attn_bias,
-                cross_attn_bias,
+                self_attn_bias=None,
+                cross_attn_bias=None,
                 caches=None):
+        """
+        Applies a stack of N Transformer decoder layers on inputs.
+
+        Parameters:
+            dec_input (Variable): The input of Transformer decoder. It is a tensor
+                with shape `[batch_size, target_length, d_model]`. The data type
+                should be float32 or float64.
+            enc_output (Variable): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            self_attn_bias (Variable, optional): A tensor used in decoder self attention
+                to mask out attention on unwanted positions, usually the subsequent positions.
+                It is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+            cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross
+                attention to mask out attention on unwanted positions, usually the paddings.
+                It is a tensor with shape `[batch_size, n_head, target_length, source_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+            caches(list, optional): It caches the multi-head attention intermediate results
+                of history decoding steps and encoder output. It is a list of dict
+                where the length of list is decoder layer number, and each dict
+                has `k`, `v`, `static_k`, `static_v` as keys and values are cached
+                results. It is only used for inference and should be None for
+                training. Default None
+
+        Returns:
+            Variable: The output of Transformer decoder. It is a tensor that has \
+                the same shape and data type as `dec_input`.
+        """
         for i, decoder_layer in enumerate(self.decoder_layers):
             dec_output = decoder_layer(dec_input, enc_output, self_attn_bias,
-                                       cross_attn_bias, None
-                                       if caches is None else caches[i])
+                                       cross_attn_bias, caches[i]
+                                       if caches else None)
             dec_input = dec_output
 
         return self.processer(dec_output)
 
     def prepare_static_cache(self, enc_output):
+        """
+        Generate a list of dict where the length of list is decoder layer number.
+        Each dict has `static_k`, `static_v` as keys, and values are projected
+        results of encoder output to be used as keys and values in decoder-encoder
+        cross (multi-head) attention. Used in inference.
+
+        Parameters:
+            enc_output (Variable): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+
+        Returns:
+            list: A list of dict. Each dict has `static_k`, `static_v` as keys, \
+                and values are projected results of encoder output to be used as \
+                keys and values in decoder-encoder cross (multi-head) attention.
+        """
         return [
             dict(
                 zip(("static_k", "static_v"),
@@ -1555,110 +3475,116 @@ def prepare_static_cache(self, enc_output):
             for decoder_layer in self.decoder_layers
         ]
 
+    def prepare_incremental_cache(self, enc_output):
+        """
+        Generate a list of dict where the length of list is decoder layer number.
+        Each dict has `k`, `v` as keys, and values are empty tensors with shape
+        `[batch_size, n_head, 0, d_key]` and `[batch_size, n_head, 0, d_value]`,
+        representing the decoder self (multi-head) attention intermediate results,
+        and 0 is the initial length which would increase as inference decoding
+        continued. Used in inference.
 
-#TODO: we should merge GRUCell with BasicGRUCell
-class GRUCell(RNNCell):
-    def __init__(self,
-                 input_size,
-                 hidden_size,
-                 param_attr=None,
-                 bias_attr=None,
-                 gate_activation='sigmoid',
-                 candidate_activation='tanh',
-                 origin_mode=False):
-        super(GRUCell, self).__init__()
-        self.hidden_size = hidden_size
-        self.fc_layer = Linear(
-            input_size, hidden_size * 3, param_attr=param_attr)
-
-        self.gru_unit = GRUUnit(
-            hidden_size * 3,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            activation=candidate_activation,
-            gate_activation=gate_activation,
-            origin_mode=origin_mode)
-
-    def forward(self, inputs, states):
-        # for GRUCell, `step_outputs` and `new_states` both are hidden
-        x = self.fc_layer(inputs)
-        hidden, _, _ = self.gru_unit(x, states)
-        return hidden, hidden
-
-    @property
-    def state_shape(self):
-        return [self.hidden_size]
-
-
-#TODO: we should merge GRUCell with BasicGRUCell
-class GRUEncoderCell(RNNCell):
-    def __init__(self,
-                 num_layers,
-                 input_size,
-                 hidden_size,
-                 dropout_prob=0.,
-                 init_scale=0.1):
-        super(GRUEncoderCell, self).__init__()
-        self.dropout_prob = dropout_prob
-        # use add_sublayer to add multi-layers
-        self.gru_cells = []
-        for i in range(num_layers):
-            self.gru_cells.append(
-                self.add_sublayer(
-                    "gru_%d" % i,
-                    #BasicGRUCell(
-                    GRUCell(
-                        input_size=input_size if i == 0 else hidden_size,
-                        hidden_size=hidden_size,
-                        param_attr=fluid.ParamAttr(
-                            initializer=fluid.initializer.UniformInitializer(
-                                low=-init_scale, high=init_scale)))))
-
-    def forward(self, step_input, states):
-        new_states = []
-        for i, gru_cell in enumerate(self.gru_cells):
-            out, state = gru_cell(step_input, states[i])
-            step_input = layers.dropout(
-                out,
-                self.dropout_prob,
-                dropout_implementation='upscale_in_train'
-            ) if self.dropout_prob > 0 else out
-            new_states.append(step_input)
-        return step_input, new_states
-
-    @property
-    def state_shape(self):
-        return [cell.state_shape for cell in self.gru_cells]
-
-
-class BiGRU(fluid.dygraph.Layer):
-    def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
-        super(BiGRU, self).__init__()
-        self.gru = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0,
-                                      init_bound),
-                       is_reverse=False,
-                       time_major=False)
-
-        self.gru_r = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0,
-                                        init_bound),
-                         is_reverse=True,
-                         time_major=False)
-
-    def forward(self, input_feature):
-        pre_gru, pre_state = self.gru(input_feature)
-        gru_r, r_state = self.gru_r(input_feature)
-        bi_merge = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1)
-        return bi_merge
-
+        Parameters:
+            enc_output (Variable): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64. Actually, it is used to provide batch
+                size for Transformer initial states(caches), thus any tensor has
+                wanted batch size can be used here.
 
-class Linear_chain_crf(fluid.dygraph.Layer):
-    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
-        super(Linear_chain_crf, self).__init__()
+        Returns:
+            list: A list of dict. Each dict has `k`, `v` as keys, and values are \
+                empty tensors representing intermediate results of history decoding \
+                steps in decoder self (multi-head) attention at time step 0.
+        """
+        return [{
+            "k": layers.fill_constant_batch_size_like(
+                input=enc_output,
+                shape=[-1, self.n_head, 0, self.d_key],
+                dtype=enc_output.dtype,
+                value=0),
+            "v": layers.fill_constant_batch_size_like(
+                input=enc_output,
+                shape=[-1, self.n_head, 0, self.d_value],
+                dtype=enc_output.dtype,
+                value=0),
+        } for i in range(self.n_layer)]
+
+
+class LinearChainCRF(Layer):
+    """
+    Computes the negative log-likelihood of tag sequences in a linear chain CRF.
+    Using terminologies of undirected probabilistic graph model, it calculates
+    probability using unary potentials (for emission) and binary potentials 
+    (for transition). 
+
+    This layer creates a learnable parameter shaped `[size + 2, size]` (`size`
+    is for the number of tags), where:
+    
+    1. the first row is for starting weights, denoted as $a$ here
+    
+    2. the second row is for ending weights, denoted as $b$ here.
+    
+    3. the remaining rows is a matrix for transition weights. 
+    
+    Denote input tensor of unary potentials(emission) as $x$ , then the probability
+    of a tag sequence $s$ of length $L$ is defined as:
+
+    $$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
+                    + \sum_{l=1}^L x_{s_l}
+                    + \sum_{l=2}^L w_{s_{l-1},s_l})$$
+    
+    where $Z$ is a normalization value so that the sum of $P(s)$ over
+    all possible sequences is 1, and $x$ is the emission feature weight
+    to the linear chain CRF.
+
+    This operator implements the Forward-Backward algorithm for the linear chain
+    CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
+    http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
+
+    NOTE:
+
+    1. The feature function for a CRF is made up of the emission features and the
+    transition features. The emission feature weights are NOT computed in
+    this operator. They MUST be computed first before this operator is called.
+
+    2. Because this operator performs global normalization over all possible
+    sequences internally, it expects UNSCALED emission feature weights.
+    Please do not call this op with the emission feature being output of any
+    nonlinear activation.
+
+    3. The 2nd dimension of input(emission) MUST be equal to the tag number.
+
+    Parameters:
+        size (int): The number of tags.
+        param_attr (ParamAttr, optional): The attribute of the learnable parameter for
+            transition. Default: None
+        dtype (str, optional): Data type, it can be 'float32' or 'float64'.
+            Default: `float32`
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import LinearChainCRF
+
+            # emission: [batch_size, sequence_length, num_tags]
+            emission = paddle.rand((2, 8, 5))
+            # label: [batch_size, sequence_length]
+            # dummy label just for example usage
+            label = paddle.ones((2, 8), dtype='int64')  
+            length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
+            crf = LinearChainCRF(size=5)
+            cost = crf(emission, label, length)  # [2, 1]
+    """
 
+    def __init__(self, size, param_attr=None, dtype='float32'):
+        super(LinearChainCRF, self).__init__()
         self._param_attr = param_attr
         self._dtype = dtype
         self._size = size
-        self._is_test = is_test
         self._transition = self.create_parameter(
             attr=self._param_attr,
             shape=[self._size + 2, self._size],
@@ -1666,14 +3592,46 @@ def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
 
     @property
     def weight(self):
+        """
+        getter for transition matrix parameter
+
+        Returns:
+            Parameter: The learnable transition parameter shaped `[size + 2, size]` \
+                (`size` is for the number of tags). The data type should be float32 \
+                or float64.
+        """
         return self._transition
 
     @weight.setter
     def weight(self, value):
+        """
+        setter for transition matrix parameter
+
+        Parameters:
+            value (Parameter): The learnable transition parameter shaped `[size + 2, size]` \
+                (`size` is for the number of tags). The data type should be float32 \
+                or float64.
+        """
         self._transition = value
 
-    def forward(self, input, label, length=None):
+    def forward(self, input, label, length):
+        """
+        Computes the negative log-likelihood of tag sequences in a linear chain CRF.
+
+        Parameters:
+            input (Variable): The input of unary potentials(emission). It is a
+                tensor with shape `[batch_size, sequence_length, num_tags]`.
+                The data type should be float32 or float64.
+            label (Variable): The golden sequence tags. It is a tensor
+                with shape `[batch_size, sequence_length]`. The data type
+                should be int64.
+            length (Variable): A tensor with shape `[batch_size]`. It stores real
+                length of each sequence for correctness.
 
+        Returns:
+            Variable: The negative log-likelihood of tag sequences. It is a tensor \
+                with shape `[batch_size, 1]` and has float32 or float64 data type.
+        """
         alpha = self._helper.create_variable_for_type_inference(
             dtype=self._dtype)
         emission_exps = self._helper.create_variable_for_type_inference(
@@ -1697,18 +3655,60 @@ def forward(self, input, label, length=None):
                 "EmissionExps": [emission_exps],
                 "TransitionExps": transition_exps,
                 "LogLikelihood": log_likelihood
-            },
-            attrs={"is_test": self._is_test, })
+            })
         return log_likelihood
 
 
-class Crf_decoding(fluid.dygraph.Layer):
-    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
-        super(Crf_decoding, self).__init__()
+class CRFDecoding(Layer):
+    """
+    CRFDecoding reads the emission feature weights and the transition
+    feature weights learned by the `LinearChainCRF` and performs decoding. 
+    It implements the Viterbi algorithm which is a dynamic programming algorithm 
+    for finding the most likely sequence of hidden states, called the Viterbi path, 
+    that results in a sequence of observed tags.
+
+    The output of this layer changes according to whether `label` is given:
+
+    1. `label` is given:
+
+    This happens in training. This operator is used to co-work with the chunk_eval
+    operator. When `label` is given, it returns tensor with the same shape as 
+    `label` whose values are fixed to be 0, indicating an incorrect prediction,
+    or 1 indicating a tag is correctly predicted. Such an output is the input to
+    chunk_eval operator.
+
+    2. `label` is not given:
+
+    This is the standard decoding process and get the highest scoring sequence
+    of tags.
+
+    Parameters:
+        size (int): The number of tags.
+        param_attr (ParamAttr, optional): The attribute of the learnable parameter for
+            transition. Default: None
+        dtype (str, optional): Data type, it can be 'float32' or 'float64'.
+            Default: `float32`
+
+    Examples:
+
+        .. code-block:: python
 
+            import numpy as np
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import CRFDecoding
+
+            # emission: [batch_size, sequence_length, num_tags]
+            emission = paddle.rand((2, 8, 5))
+            length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
+            crf_decoding = CRFDecoding(size=5)
+            cost = crf_decoding(emission, length)  # [2, 8]
+    """
+
+    def __init__(self, size, param_attr=None, dtype='float32'):
+        super(CRFDecoding, self).__init__()
         self._dtype = dtype
         self._size = size
-        self._is_test = is_test
         self._param_attr = param_attr
         self._transition = self.create_parameter(
             attr=self._param_attr,
@@ -1717,13 +3717,49 @@ def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
 
     @property
     def weight(self):
+        """
+        getter for transition matrix parameter
+
+        Returns:
+            Parameter: The learnable transition parameter shaped `[size + 2, size]` \
+                (`size` is for the number of tags). The data type should be float32 \
+                or float64.
+        """
         return self._transition
 
     @weight.setter
     def weight(self, value):
+        """
+        setter for transition matrix parameter
+
+        Parameters:
+            value (Parameter): The learnable transition parameter shaped `[size + 2, size]` \
+                (`size` is for the number of tags). The data type should be float32 \
+                or float64.
+        """
         self._transition = value
 
-    def forward(self, input, label=None, length=None):
+    def forward(self, input, length, label=None):
+        """
+        Performs sequence tagging prediction.
+
+        Parameters:
+            input (Variable): The input of unary potentials(emission). It is a
+                tensor with shape `[batch_size, sequence_length, num_tags]`.
+                The data type should be float32 or float64.
+            length (Variable): A tensor with shape `[batch_size]`.
+                It stores real length of each sequence for correctness.
+            label (Variable, optional): The golden sequence tags. It is a tensor
+                with shape `[batch_size, sequence_length]`. The data type
+                should be int64. Default None.
+
+        Returns:
+            Variable: A tensor with shape `[batch_size, sequence_length]` and \
+                int64 data type. If `label` is given, the tensor has binary values \
+                indicating a correct or incorrect prediction. Otherwise its values \
+                range from 0 to maximum tag number - 1, each element indicates \
+                an index of a predicted tag.
+        """
 
         viterbi_path = self._helper.create_variable_for_type_inference(
             dtype=self._dtype)
@@ -1737,21 +3773,22 @@ def forward(self, input, label=None, length=None):
         self._helper.append_op(
             type='crf_decoding',
             inputs=this_inputs,
-            outputs={"ViterbiPath": [viterbi_path]},
-            attrs={"is_test": self._is_test, })
+            outputs={"ViterbiPath": [viterbi_path]})
         return viterbi_path
 
 
-class GRUEncoderLayer(Layer):
+class _GRUEncoder(Layer):
+    """
+    A multi-layer bidirectional GRU encoder used by SequenceTagging.
+    """
+
     def __init__(self,
                  input_dim,
                  grnn_hidden_dim,
                  init_bound,
                  num_layers=1,
-                 h_0=None,
                  is_bidirection=False):
-        super(GRUEncoderLayer, self).__init__()
-        self.h_0 = h_0
+        super(_GRUEncoder, self).__init__()
         self.num_layers = num_layers
         self.is_bidirection = is_bidirection
         self.gru_list = []
@@ -1788,7 +3825,7 @@ def __init__(self,
                             is_reverse=True,
                             time_major=False)))
 
-    def forward(self, input_feature):
+    def forward(self, input_feature, h0=None):
         for i in range(self.num_layers):
             pre_gru, pre_state = self.gru_list[i](input_feature)
             if self.is_bidirection:
@@ -1800,28 +3837,55 @@ def forward(self, input_feature):
         return out
 
 
-class SequenceTagging(fluid.dygraph.Layer):
+class SequenceTagging(Layer):
+    """
+    Sequence tagging model using multi-layer bidirectional GRU as backbone and
+    linear chain CRF as output layer.
+
+    Parameters:
+        vocab_size (int): The size of vocabulary.
+        num_labels (int): The number of labels.
+        word_emb_dim (int, optional): The embedding size. Default 128
+        grnn_hidden_dim (int, optional): The hidden size of GRU. Default 128
+        emb_learning_rate (float, optional): The partial learning rate for embedding.
+            The actual learning rate for embedding would multiply it with the global
+            learning rate. Default 0.1
+        crf_learning_rate (float, optional): The partial learning rate for crf. The
+            actual learning rate for crf would multiply it with the global
+            learning rate. Default 0.1
+        bigru_num (int, optional): The number of bidirectional GRU layers.
+            Default 2
+        init_bound (float, optional): The range for uniform initializer would
+            be `(-init_bound, init_bound)`. It would be used for all parameters
+            except CRF transition matrix. Default 0.1
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import SequenceTagging
+
+            # word: [batch_size, sequence_length]
+            # dummy input just for example
+            word = paddle.ones((2, 8), dtype='int64')
+            length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
+            seq_tagger = SequenceTagging(vocab_size=100, num_labels=5)
+            outputs = seq_tagger(word, length)
+    """
+
     def __init__(self,
                  vocab_size,
                  num_labels,
-                 batch_size,
                  word_emb_dim=128,
                  grnn_hidden_dim=128,
                  emb_learning_rate=0.1,
                  crf_learning_rate=0.1,
                  bigru_num=2,
-                 init_bound=0.1,
-                 length=None):
+                 init_bound=0.1):
         super(SequenceTagging, self).__init__()
-        """
-        define the sequence tagging network structure
-        word: stores the input of the model
-        for_infer: a boolean value, indicating if the model to be created is for training or predicting.
-
-        return:
-            for infer: return the prediction
-            otherwise: return the prediction
-        """
         self.word_emb_dim = word_emb_dim
         self.vocab_size = vocab_size
         self.num_labels = num_labels
@@ -1829,7 +3893,6 @@ def __init__(self,
         self.emb_lr = emb_learning_rate
         self.crf_lr = crf_learning_rate
         self.bigru_num = bigru_num
-        self.batch_size = batch_size
         self.init_bound = 0.1
 
         self.word_embedding = Embedding(
@@ -1841,20 +3904,11 @@ def __init__(self,
                 initializer=fluid.initializer.Uniform(
                     low=-self.init_bound, high=self.init_bound)))
 
-        h_0 = fluid.layers.create_global_var(
-            shape=[self.batch_size, self.grnn_hidden_dim],
-            value=0.0,
-            dtype='float32',
-            persistable=True,
-            force_cpu=True,
-            name='h_0')
-
-        self.gru_encoder = GRUEncoderLayer(
+        self.gru_encoder = _GRUEncoder(
             input_dim=self.grnn_hidden_dim,
             grnn_hidden_dim=self.grnn_hidden_dim,
             init_bound=self.init_bound,
             num_layers=self.bigru_num,
-            h_0=h_0,
             is_bidirection=True)
 
         self.fc = Linear(
@@ -1866,19 +3920,39 @@ def __init__(self,
                 regularizer=fluid.regularizer.L2DecayRegularizer(
                     regularization_coeff=1e-4)))
 
-        self.linear_chain_crf = Linear_chain_crf(
+        self.linear_chain_crf = LinearChainCRF(
             param_attr=fluid.ParamAttr(
                 name='linear_chain_crfw', learning_rate=self.crf_lr),
             size=self.num_labels)
 
-        self.crf_decoding = Crf_decoding(
+        self.crf_decoding = CRFDecoding(
             param_attr=fluid.ParamAttr(
                 name='crfw', learning_rate=self.crf_lr),
             size=self.num_labels)
 
     def forward(self, word, lengths, target=None):
         """
-        Configure the network
+        Performs sequence tagging. If `target` is given, it is for training and
+        loss would be returned, otherwise it is for inference and returns the
+        predicted tags.
+
+        Parameters:
+            word (Variable): The input sequences to be labeled. It is a tensor
+                with shape `[batch_size, sequence_length]`. The data type should
+                be int64.
+            lengths (Variable): A tensor with shape `[batch_size]`. It stores real
+                length of each sequence.
+            target (Variable, optional): The golden sequence tags. It is a tensor
+                with shape `[batch_size, sequence_length]`. The data type
+                should be int64. It could be None for inference. Default None.
+
+        Returns:
+            tuple: A tuple( :code:`(crf_decode, avg_cost, lengths)` ) If input \
+                argument `target` is provided, including the most likely sequence \
+                tags, the averaged CRF cost and the sequence lengths, the shapes \
+                are `[batch_size, sequence_length]`, `[1]` and `[batch_size]`, \
+                and the data types are int64, float32 and int64. Otherwise A \
+                tuple( :code:`(crf_decode, lengths)` ) for inference.
         """
         word_embed = self.word_embedding(word)
         input_feature = word_embed