From 06fd61f8b514dcc218831402e3bcdbf8c82ef719 Mon Sep 17 00:00:00 2001 From: jinyuKing <2943829328@qq.com> Date: Mon, 27 Apr 2020 10:11:58 +0000 Subject: [PATCH 1/7] update text.py --- hapi/text/text.py | 98 ++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index 1228d9f..5ff5d1c 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -19,7 +19,6 @@ import os import six import sys - if six.PY2: reload(sys) sys.setdefaultencoding('utf8') @@ -50,8 +49,8 @@ 'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', 'TransformerDecoder', 'TransformerBeamSearchDecoder', 'Linear_chain_crf', - 'Crf_decoding', 'SequenceTagging', 'GRUEncoderLayer', 'CNNEncoder', - 'BOWEncoder', 'SimpleConvPoolLayer', 'GRUEncoder', 'DynamicGRU', 'LSTMEncoder' + 'Crf_decoding', 'SequenceTagging', 'GRUEncoderLayer', 'SimCNNEncoder', + 'SimBOWEncoder', 'SimpleConvPoolLayer', 'SimGRUEncoder', 'DynamicGRU', 'SimLSTMEncoder' ] @@ -89,12 +88,12 @@ def get_initial_states(self, batch_ref = flatten(batch_ref)[0] def _is_shape_sequence(seq): - if sys.version_info < (3,): + if sys.version_info < (3, ): integer_types = ( int, - long,) + long, ) else: - integer_types = (int,) + integer_types = (int, ) """For shape, list/tuple of integer is the finest-grained objection""" if (isinstance(seq, list) or isinstance(seq, tuple)): if reduce( @@ -249,8 +248,8 @@ def __init__(self, self.use_customized_weight = False for _weights in [ - forget_gate_weights, input_gate_weights, output_gate_weights, - cell_weights + forget_gate_weights, input_gate_weights, output_gate_weights, + cell_weights ]: for _key in _weights: if _weights[_key] is not None: @@ -275,7 +274,7 @@ def __init__(self, is_bias=True) else: if "w" in forget_gate_weights and forget_gate_weights[ - "w"] is not None: + "w"] is not None: self.fg_w = forget_gate_weights["w"] else: if self._param_attr is not None and self._param_attr.name is not None: @@ -289,7 +288,7 @@ def __init__(self, dtype=self._dtype) if "h" in forget_gate_weights and forget_gate_weights[ - "h"] is not None: + "h"] is not None: self.fg_h = forget_gate_weights["h"] else: if self._param_attr is not None and self._param_attr.name is not None: @@ -303,7 +302,7 @@ def __init__(self, dtype=self._dtype) if "b" in forget_gate_weights and forget_gate_weights[ - "b"] is not None: + "b"] is not None: self.fg_b = forget_gate_weights["b"] else: if self._bias_attr is not None and self._bias_attr.name is not None: @@ -318,7 +317,7 @@ def __init__(self, is_bias=True) if "w" in input_gate_weights and input_gate_weights[ - "w"] is not None: + "w"] is not None: self.ig_w = input_gate_weights["w"] else: if self._param_attr is not None and self._param_attr.name is not None: @@ -333,7 +332,7 @@ def __init__(self, dtype=self._dtype) if "h" in input_gate_weights and input_gate_weights[ - "h"] is not None: + "h"] is not None: self.ig_h = input_gate_weights["h"] else: if self._param_attr is not None and self._param_attr.name is not None: @@ -348,7 +347,7 @@ def __init__(self, dtype=self._dtype) if "b" in input_gate_weights and input_gate_weights[ - "b"] is not None: + "b"] is not None: self.ig_b = input_gate_weights["b"] else: if self._bias_attr is not None and self._bias_attr.name is not None: @@ -363,7 +362,7 @@ def __init__(self, is_bias=True) if "w" in output_gate_weights and output_gate_weights[ - "w"] is not None: + "w"] is not None: self.og_w = 
output_gate_weights["w"] else: if self._param_attr is not None and self._param_attr.name is not None: @@ -377,7 +376,7 @@ def __init__(self, dtype=self._dtype) if "h" in output_gate_weights and output_gate_weights[ - "h"] is not None: + "h"] is not None: self.og_h = output_gate_weights["h"] else: if self._param_attr is not None and self._param_attr.name is not None: @@ -392,7 +391,7 @@ def __init__(self, dtype=self._dtype) if "b" in output_gate_weights and output_gate_weights[ - "b"] is not None: + "b"] is not None: self.og_b = output_gate_weights["b"] else: if self._bias_attr is not None and self._bias_attr.name is not None: @@ -547,7 +546,7 @@ def __init__(self, self.use_customized_weight = False for _weights in [ - update_gate_weights, reset_gate_weights, cell_weights + update_gate_weights, reset_gate_weights, cell_weights ]: for _key in _weights: if _weights[_key] is not None: @@ -603,7 +602,7 @@ def __init__(self, # create the parameters of gates in gru if "w" in update_gate_weights and update_gate_weights[ - "w"] is not None: + "w"] is not None: self.ug_w = update_gate_weights["w"] else: if gate_param_attr is not None and gate_param_attr.name is not None: @@ -617,7 +616,7 @@ def __init__(self, dtype=self._dtype) if "h" in update_gate_weights and update_gate_weights[ - "h"] is not None: + "h"] is not None: self.ug_h = update_gate_weights["h"] else: if gate_param_attr is not None and gate_param_attr.name is not None: @@ -631,7 +630,7 @@ def __init__(self, dtype=self._dtype) if "b" in update_gate_weights and update_gate_weights[ - "b"] is not None: + "b"] is not None: self.ug_b = update_gate_weights["b"] else: if gate_bias_attr is not None and gate_bias_attr.name is not None: @@ -647,7 +646,7 @@ def __init__(self, # reset gate parameters if "w" in reset_gate_weights and reset_gate_weights[ - "w"] is not None: + "w"] is not None: self.rg_w = reset_gate_weights["w"] else: if gate_param_attr is not None and gate_param_attr.name is not None: @@ -661,7 +660,7 @@ def __init__(self, dtype=self._dtype) if "h" in reset_gate_weights and reset_gate_weights[ - "h"] is not None: + "h"] is not None: self.rg_h = reset_gate_weights["h"] else: if gate_param_attr is not None and gate_param_attr.name is not None: @@ -675,7 +674,7 @@ def __init__(self, dtype=self._dtype) if "b" in reset_gate_weights and reset_gate_weights[ - "b"] is not None: + "b"] is not None: self.rg_b = reused_params["b"] else: if gate_bias_attr is not None and gate_bias_attr.name is not None: @@ -803,7 +802,7 @@ def _maybe_copy(state, new_state, step_mask): new_state = fluid.layers.elementwise_mul( new_state, step_mask, axis=0) - fluid.layers.elementwise_mul( - state, (step_mask - 1), axis=0) + state, (step_mask - 1), axis=0) return new_state flat_inputs = flatten(inputs) @@ -849,8 +848,8 @@ def _maybe_copy(state, new_state, step_mask): outputs = map_structure( lambda x: ArrayWrapper(x), step_outputs) if i == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) + lambda x, x_array: x_array.append(x), step_outputs, + outputs) final_outputs = map_structure( lambda x: fluid.layers.stack(x.array, @@ -919,7 +918,7 @@ def _maybe_copy(state, new_state, step_mask): step_mask.stop_gradient = True new_state = layers.elementwise_mul( state, step_mask, axis=0) - layers.elementwise_mul( - new_state, (step_mask - 1), axis=0) + new_state, (step_mask - 1), axis=0) if convert_dtype(state_dtype) in ["bool"]: new_state = layers.cast(new_state, dtype=state_dtype) return new_state @@ -961,8 +960,8 @@ def 
_maybe_copy(state, new_state, step_mask): outputs = map_structure( lambda x: ArrayWrapper(x), step_outputs) if step_idx == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) + lambda x, x_array: x_array.append(x), step_outputs, + outputs) inputs, states, finished, sequence_lengths = ( next_inputs, next_states, next_finished, next_sequence_lengths) @@ -991,7 +990,7 @@ def _maybe_copy(state, new_state, step_mask): return (final_outputs, final_states, sequence_lengths) if self.return_length else ( - final_outputs, final_states) + final_outputs, final_states) else: return fluid.layers.dynamic_decode( self.decoder, @@ -1042,7 +1041,7 @@ def _merge_batch_beams_with_var_dim(self, x): x = layers.reshape( x, [0] * (len(x.shape) - var_dim_in_state ) + [self.batch_size * self.beam_size] + - [int(size) for size in x.shape[-var_dim_in_state + 2:]]) + [int(size) for size in x.shape[-var_dim_in_state + 2:]]) x = layers.transpose( x, list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) + @@ -1053,9 +1052,9 @@ def _split_batch_beams_with_var_dim(self, x): var_dim_size = layers.shape(x)[self.var_dim_in_state] x = layers.reshape( x, [-1, self.beam_size] + - [int(size) - for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + - [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) + [int(size) + for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + + [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) return x def step(self, time, inputs, states, **kwargs): @@ -1118,7 +1117,7 @@ def __init__(self, elif cmd == "d": # add dropout self.functors.append(lambda x: layers.dropout( x, dropout_prob=dropout_rate, is_test=False) - if dropout_rate else x) + if dropout_rate else x) def forward(self, x, residual=None): for i, cmd in enumerate(self.process_cmd): @@ -1219,7 +1218,7 @@ def forward(self, queries, keys, values, attn_bias, cache=None): # scale dot product attention product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.d_model ** -0.5) + x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5) if attn_bias: product += attn_bias weights = layers.softmax(product) @@ -1309,6 +1308,7 @@ def __init__(self, reused_ffn_weights={"reused_fc1": None, "reused_fc2": None}, reused_post_ffn_layernorm=None): + super(TransformerEncoderLayer, self).__init__() self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, @@ -1556,7 +1556,7 @@ def prepare_static_cache(self, enc_output): ] -# TODO: we should merge GRUCell with BasicGRUCell +#TODO: we should merge GRUCell with BasicGRUCell class GRUCell(RNNCell): def __init__(self, input_size, @@ -1590,7 +1590,7 @@ def state_shape(self): return [self.hidden_size] -# TODO: we should merge GRUCell with BasicGRUCell +#TODO: we should merge GRUCell with BasicGRUCell class GRUEncoderCell(RNNCell): def __init__(self, num_layers, @@ -1606,7 +1606,7 @@ def __init__(self, self.gru_cells.append( self.add_sublayer( "gru_%d" % i, - # BasicGRUCell( + #BasicGRUCell( GRUCell( input_size=input_size if i == 0 else hidden_size, hidden_size=hidden_size, @@ -1673,6 +1673,7 @@ def weight(self, value): self._transition = value def forward(self, input, label, length=None): + alpha = self._helper.create_variable_for_type_inference( dtype=self._dtype) emission_exps = self._helper.create_variable_for_type_inference( @@ -1723,6 +1724,7 @@ def weight(self, value): self._transition = value def forward(self, input, label=None, length=None): + viterbi_path = self._helper.create_variable_for_type_inference( 
dtype=self._dtype) this_inputs = { @@ -1919,7 +1921,7 @@ def forward(self, input): return x -class CNNEncoder(Layer): +class SimCNNEncoder(Layer): """ simple CNNEncoder for simnet """ @@ -1933,7 +1935,7 @@ def __init__(self, padding_idx, act ): - super(CNNEncoder, self).__init__() + super(SimCNNEncoder, self).__init__() self.dict_size = dict_size self.emb_dim = emb_dim self.filter_size = filter_size @@ -1962,7 +1964,7 @@ def forward(self, input): emb_out=self.cnn_layer(emb_reshape) return emb_out -class BOWEncoder(Layer): +class SimBOWEncoder(Layer): """ simple BOWEncoder for simnet """ @@ -1973,7 +1975,7 @@ def __init__(self, seq_len, padding_idx ): - super(BOWEncoder, self).__init__() + super(SimBOWEncoder, self).__init__() self.dict_size = dict_size self.bow_dim = bow_dim self.seq_len = seq_len @@ -2034,7 +2036,7 @@ def forward(self, inputs): res = fluid.layers.concat(res, axis=1) return res -class GRUEncoder(Layer): +class SimGRUEncoder(Layer): """ simple GRUEncoder for simnet """ @@ -2046,7 +2048,7 @@ def __init__(self, padding_idx, seq_len ): - super(GRUEncoder, self).__init__() + super(SimGRUEncoder, self).__init__() self.dict_size = dict_size self.emb_dim = emb_dim self.gru_dim = gru_dim @@ -2071,7 +2073,7 @@ def forward(self, input): gru = fluid.layers.tanh(gru) return gru -class LSTMEncoder(Layer): +class SimLSTMEncoder(Layer): """ simple LSTMEncoder for simnet """ @@ -2087,7 +2089,7 @@ def __init__(self, """ initialize """ - super(LSTMEncoder, self).__init__() + super(SimLSTMEncoder, self).__init__() self.dict_size = dict_size self.emb_dim = emb_dim self.lstm_dim = lstm_dim From 75dcc1612da18040d4df65cf36d9a4db222a1e6b Mon Sep 17 00:00:00 2001 From: jinyuKing <2943829328@qq.com> Date: Wed, 6 May 2020 09:58:16 +0000 Subject: [PATCH 2/7] update text.py --- hapi/text/text.py | 370 ++++++++++++++++++++++------------------------ 1 file changed, 173 insertions(+), 197 deletions(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index 5ff5d1c..9fec48c 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -37,7 +37,7 @@ import paddle.fluid as fluid import paddle.fluid.layers.utils as utils from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as -from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm, GRUUnit, Conv2D +from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D from paddle.fluid.data_feeder import convert_dtype from paddle.fluid import layers @@ -49,8 +49,8 @@ 'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', 'TransformerDecoder', 'TransformerBeamSearchDecoder', 'Linear_chain_crf', - 'Crf_decoding', 'SequenceTagging', 'GRUEncoderLayer', 'SimCNNEncoder', - 'SimBOWEncoder', 'SimpleConvPoolLayer', 'SimGRUEncoder', 'DynamicGRU', 'SimLSTMEncoder' + 'Crf_decoding', 'SequenceTagging', 'GRUEncoderLayer', 'Conv1dPoolLayer', + 'CNNEncoder' ] @@ -1898,226 +1898,202 @@ def forward(self, word, lengths, target=None): crf_decode = self.crf_decoding(input=emission, length=lengths) return crf_decode, lengths -class SimpleConvPoolLayer(Layer): +class Conv1dPoolLayer(Layer): + """ + This interface is used to construct a callable object of the ``Conv1DPoolLayer`` class.The ``Conv1DPoolLayer`` is composed of a ``Conv2D`` and a ``Pool2D`` . 
+ For more details, refer to code examples.The ``Conv1DPoolLayer`` layer calculates the output based on the input, filter and strides, paddings, dilations, + groups,global_pooling, pool_type,ceil_mode,exclusive parameters.Input and Output are in NCH format, where N is batch size, C is the number of the feature map, + H is the height of the feature map.The data type of Input data and Output data is 'float32' or 'float64'. + + Args: + input(Variable):3-D Tensor, shape is [N, C, H], data type can be float32 or float64 + num_channels(int): The number of channels in the input data. + num_filters(int): The number of filters. It is the same as the output channels. + filter_size (int): The filter size of Conv1DPoolLayer. + pool_size (int): The pooling size of Conv1DPoolLayer. + conv_stride (int): The stride size of the conv Layer in Conv1DPoolLayer. Default: 1 + pool_stride (int): The stride size of the pool layer in Conv1DPoolLayer. Default: 1 + conv_padding (int): The padding size of the conv Layer in Conv1DPoolLayer. Default: 0 + pool_padding (int): The padding of pool layer in Conv1DPoolLayer. Default: 0 + pool_type (str): Pooling type can be `max` for max-pooling or `avg` for average-pooling. Default: math:`max` + global_pooling (bool): Whether to use the global pooling. If global_pooling = true, pool_size and pool_padding while be ignored. Default: False + dilation (int): The dilation size of the conv Layer. Default: 1. + groups (int): The groups number of the conv Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: 1. + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights of conv layer. If it is set to None or one attribute of + ParamAttr, conv2d will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized + with :`Normal(0.0, std)`,and the :`std` is :`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`.Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv.If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not + set, the bias is initialized zero. Default: None. + name(str, optional): The default value is None. Normally there is no need for user to set this property. Default: None + act (str): Activation type for conv layer, if it is set to None, activation is not appended. Default: None. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: False + ceil_mode (bool, optional): Whether to use the ceil function to calculate output height and width. + False is the default. If it is set to False, the floor function will be used. Default: False. + exclusive (bool, optional): Whether to exclude padding points in average pooling mode. Default: True. 
+ + Return: + 3-D Tensor, the result of input after conv and pool, with the same data type as :`input` + + Return Type: + Variable + + Example: + ```python + import paddle.fluid as fluid + from hapi.text import Conv1dPoolLayer + + test=np.random.uniform(-1,1,[2,3,4]).astype('float32') + with fluid.dygraph.guard(): + paddle_input=to_variable(test) + print(paddle_input.shape) + cov2d=Conv1dPoolLayer(3,4,2,2) + paddle_out=cov2d(paddle_input) + print(paddle_out.shape)#[2,4,2] + + ``` + """ def __init__(self, num_channels, num_filters, filter_size, + pool_size, + conv_stride=1, + pool_stride=1, + conv_padding=0, + pool_padding=0, + pool_type='max', + global_pooling=False, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + act=None, use_cudnn=False, - act=None + ceil_mode=False, + exclusive=True, ): - super(SimpleConvPoolLayer, self).__init__() + super(Conv1dPoolLayer, self).__init__() self._conv2d = Conv2D(num_channels=num_channels, num_filters=num_filters, - filter_size=filter_size, - padding=[1, 1], + filter_size=[filter_size,1], + stride=[conv_stride,1], + padding=[conv_padding,0], + dilation=[dilation,1], + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, use_cudnn=use_cudnn, act=act) - - def forward(self, input): - x = self._conv2d(input) - x = fluid.layers.reduce_max(x, dim=-1) - x = fluid.layers.reshape(x, shape=[x.shape[0], -1]) + self._pool2d = Pool2D(pool_size=[pool_size,1], + pool_type=pool_type, + pool_stride=[pool_stride,1], + pool_padding=[pool_padding,0], + global_pooling=global_pooling, + use_cudnn=use_cudnn, + ceil_mode=ceil_mode, + exclusive=exclusive + ) + def forward(self, inputs): + x = fluid.layers.unsqueeze(inputs,axes=[-1]) + x = self._conv2d(x) + x = self._pool2d(x) + x = fluid.layers.squeeze(x, axes=[-1]) return x -class SimCNNEncoder(Layer): + + +class CNNEncoder(Layer): """ - simple CNNEncoder for simnet + This interface is used to construct a callable object of the ``CNNEncoder`` class.The ``CNNEncoder`` is composed of a ``Embedding`` and a ``Conv1dPoolLayer`` . + For more details, refer to code examples. The ``CNNEncoder`` layer calculates the output based on the input, dict_size and emb_dim, filter_size, num_filters, + use_cuda, is_sparse, param_attr parameters. The type of Input data is a Tensor or a lod-tensor .The data type of Input data is 'int64'. Output data are in NCH + format, where N is batch size, C is the number of the feature map, H is the height of the feature map. The data type of Output data is 'float32' or 'float64'. + + Args: + dict_size(int): the size of the dictionary of embeddings + emb_szie(int): the size of each embedding vector respectively. + num_channels(int): The number of channels in the input data.Default:1 + num_filters(int): The number of filters. It is the same as the output channels. + filter_size(int): The filter size of Conv1DPoolLayer in CNNEncoder. + pool_size(int): The pooling size of Conv1DPoolLayer in CNNEncoder. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: False + is_sparse(bool): The flag indicating whether to use sparse update. This parameter only affects the performance of the backwards gradient update. It is recommended + to set True because sparse update is faster. 
But some optimizer does not support sparse update,such as :ref:`api_fluid_optimizer_AdadeltaOptimizer` , + :ref:`api_fluid_optimizer_AdamaxOptimizer` , :ref:`api_fluid_optimizer_DecayedAdagradOptimizer` , :ref:`api_fluid_optimizer_FtrlOptimizer` , + :ref:`api_fluid_optimizer_LambOptimizer` and :ref:`api_fluid_optimizer_LarsMomentumOptimizer` . + In these case, is_sparse must be False. Default: True. + param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in + :ref:`api_fluid_ParamAttr` . In addition,user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector + needs to be transformed into numpy format, and the shape of local word vector should be consistent with :attr:`size` . + Then :ref:`api_fluid_initializer_NumpyArrayInitializer` is used to load custom or pre-trained word vectors. Default: None. + padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). + If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted to :math:`vocab\_size + padding\_idx` . It will output all-zero padding + data whenever lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. If set None, it makes no effect to + output. Default: None. + act (str): Activation type for `Conv1dPoollayer` layer, if it is set to None, activation is not appended. Default: None. + + Return: + 3-D Tensor, the result of input after embedding and conv1dPoollayer + + Return Type: + Variable + + Example: + ```python + import paddle.fluid as fluid + from hapi.text import CNNEncoder + + test=np.random.uniform(1,5,[2,3,4]).astype('int64') + with fluid.dygraph.guard(): + paddle_input=to_variable(test) + print(paddle_input.shape) + cov2d=CNNEncoder(128,4,3,4,2,2) + paddle_out=cov2d(paddle_input) + print(paddle_out.shape)#[8,4,2] + + ``` + """ def __init__(self, dict_size, - emb_dim, - filter_size, + emb_size, + num_channels, num_filters, - hidden_dim, - seq_len, - padding_idx, - act + filter_size, + pool_size, + use_cuda=False, + is_sparse=True, + param_attr=None, + padding_idx=None, + act=None ): - super(SimCNNEncoder, self).__init__() + super(CNNEncoder, self).__init__() self.dict_size = dict_size - self.emb_dim = emb_dim + self.emb_size = emb_size self.filter_size = filter_size self.num_filters = num_filters - self.hidden_dim = hidden_dim - self.seq_len = seq_len - self.padding_idx = padding_idx - self.act = act - self.channels = 1 - self.emb_layer = Embedding(size=[self.dict_size, self.emb_dim], - is_sparse=True, - padding_idx=self.padding_idx, - param_attr=fluid.ParamAttr(name='emb', initializer=fluid.initializer.Xavier())) - self.cnn_layer = SimpleConvPoolLayer( + self.pool_size = pool_size + self.channels = num_channels + self._emb_layer = Embedding(size=[self.dict_size, self.emb_size], + is_sparse=is_sparse, + padding_idx=padding_idx, + param_attr=param_attr) + self._cnn_layer = Conv1dPoolLayer( self.channels, self.num_filters, self.filter_size, - use_cudnn=False, - act=self.act - ) - - def forward(self, input): - emb = self.emb_layer(input) - emb_reshape = fluid.layers.reshape( - emb, shape=[-1, self.channels, self.seq_len, self.hidden_dim]) - emb_out=self.cnn_layer(emb_reshape) - return emb_out - -class SimBOWEncoder(Layer): - """ - simple BOWEncoder for simnet - """ - def __init__(self, - dict_size, - emb_dim, - bow_dim, - seq_len, - padding_idx - ): - 
super(SimBOWEncoder, self).__init__() - self.dict_size = dict_size - self.bow_dim = bow_dim - self.seq_len = seq_len - self.emb_dim = emb_dim - self.padding_idx=padding_idx - self.emb_layer = Embedding(size=[self.dict_size, self.emb_dim], - is_sparse=True, - padding_idx=self.padding_idx, - param_attr=fluid.ParamAttr(name='emb', initializer=fluid.initializer.Xavier())) - - def forward(self, input): - emb = self.emb_layer(input) - emb_reshape = fluid.layers.reshape( - emb, shape=[-1, self.seq_len, self.bow_dim]) - bow_emb = fluid.layers.reduce_sum(emb_reshape, dim=1) - return bow_emb - -class DynamicGRU(fluid.dygraph.Layer): - def __init__(self, - size, - h_0=None, - param_attr=None, - bias_attr=None, - is_reverse=False, - gate_activation='sigmoid', - candidate_activation='tanh', - origin_mode=False, - init_size=None): - super(DynamicGRU, self).__init__() - - self.gru_unit = GRUUnit( - size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode) - - self.size = size - self.h_0 = h_0 - self.is_reverse = is_reverse - - def forward(self, inputs): - hidden = self.h_0 - res = [] - for i in range(inputs.shape[1]): - if self.is_reverse: - i = inputs.shape[1] - 1 - i - input_ = inputs[:, i:i + 1, :] - input_ = fluid.layers.reshape( - input_, [-1, input_.shape[2]], inplace=False) - hidden, reset, gate = self.gru_unit(input_, hidden) - hidden_ = fluid.layers.reshape( - hidden, [-1, 1, hidden.shape[1]], inplace=False) - res.append(hidden_) - if self.is_reverse: - res = res[::-1] - res = fluid.layers.concat(res, axis=1) - return res - -class SimGRUEncoder(Layer): - """ - simple GRUEncoder for simnet - """ - def __init__(self, - dict_size, - emb_dim, - gru_dim, - hidden_dim, - padding_idx, - seq_len - ): - super(SimGRUEncoder, self).__init__() - self.dict_size = dict_size - self.emb_dim = emb_dim - self.gru_dim = gru_dim - self.seq_len=seq_len - self.hidden_dim = hidden_dim - self.padding_idx=self.padding_idx - self.emb_layer = Embedding(size=[self.dict_size, self.emb_dim], - is_sparse=True, - padding_idx=self.padding_idx, - param_attr=fluid.ParamAttr(name='emb', - initializer=fluid.initializer.Xavier())) - self.gru_layer = DynamicGRU(self.gru_dim) - self.proj_layer = Linear(input_dim=self.hidden_dim, output_dim=self.gru_dim * 3) - - def forward(self, input): - emb = self.emb_layer(input) - emb_proj = self.proj_layer(emb) - h_0 = np.zeros((emb_proj.shape[0], self.hidden_dim), dtype="float32") - h_0 = to_variable(h_0) - gru = self.gru_layer(emb_proj, h_0=h_0) - gru = fluid.layers.reduce_max(gru, dim=1) - gru = fluid.layers.tanh(gru) - return gru - -class SimLSTMEncoder(Layer): - """ - simple LSTMEncoder for simnet - """ - def __init__(self, - dict_size, - emb_dim, - lstm_dim, - hidden_dim, - seq_len, - padding_idx, - is_reverse - ): - """ - initialize - """ - super(SimLSTMEncoder, self).__init__() - self.dict_size = dict_size - self.emb_dim = emb_dim - self.lstm_dim = lstm_dim - self.hidden_dim = hidden_dim - self.seq_len = seq_len - self.is_reverse = False - self.padding_idx=padding_idx - - self.emb_layer = Embedding(size=[self.dict_size, self.emb_dim], - is_sparse=True, - padding_idx=self.padding_idx, - param_attr=fluid.ParamAttr(name='emb', initializer=fluid.initializer.Xavier())) - - self.lstm_cell = BasicLSTMCell( - hidden_size=self.lstm_dim, input_size=self.lstm_dim * 4 - ) - self.lstm_layer = RNN( - cell=self.lstm_cell, time_major=True, is_reverse=self.is_reverse + self.pool_size, + use_cudnn=use_cuda, + 
act=act ) - self.proj_layer = Linear(input_dim=self.hidden_dim, output_dim=self.lstm_dim * 4) def forward(self, input): - emb = self.emb_layer(input) - emb_proj = self.proj_layer(emb) - emb_lstm, _ = self.lstm_layer(emb_proj) - emb_reduce = fluid.layers.reduce_max(emb_lstm, dim=1) + emb = self._emb_layer(input) emb_reshape = fluid.layers.reshape( - emb_reduce, shape=[-1, self.seq_len, self.hidden_dim]) - emb_lstm = fluid.layers.reduce_sum(emb_reshape, dim=1) - emb_last = fluid.layers.tanh(emb_lstm) - return emb_last + emb, shape=[-1, self.channels, self.emb_size]) + emb_out=self._cnn_layer(emb_reshape) + return emb_out \ No newline at end of file From 3e18097486f47db9b1e5a01ec24961944e4b5ea2 Mon Sep 17 00:00:00 2001 From: jinyuKing <2943829328@qq.com> Date: Fri, 8 May 2020 04:13:22 +0000 Subject: [PATCH 3/7] update text.py --- hapi/text/text.py | 108 +++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index 9fec48c..4b9d928 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -2010,31 +2010,29 @@ class CNNEncoder(Layer): """ This interface is used to construct a callable object of the ``CNNEncoder`` class.The ``CNNEncoder`` is composed of a ``Embedding`` and a ``Conv1dPoolLayer`` . For more details, refer to code examples. The ``CNNEncoder`` layer calculates the output based on the input, dict_size and emb_dim, filter_size, num_filters, - use_cuda, is_sparse, param_attr parameters. The type of Input data is a Tensor or a lod-tensor .The data type of Input data is 'int64'. Output data are in NCH + use_cuda, is_sparse, param_attr parameters. The type of Input data is a 3-D Tensor .The data type of Input data is 'float32'. Output data are in NCH format, where N is batch size, C is the number of the feature map, H is the height of the feature map. The data type of Output data is 'float32' or 'float64'. Args: - dict_size(int): the size of the dictionary of embeddings - emb_szie(int): the size of each embedding vector respectively. - num_channels(int): The number of channels in the input data.Default:1 - num_filters(int): The number of filters. It is the same as the output channels. - filter_size(int): The filter size of Conv1DPoolLayer in CNNEncoder. - pool_size(int): The pooling size of Conv1DPoolLayer in CNNEncoder. + num_channels(int|list|tuple): The number of channels in the input data.If num_channels is a list or tuple, the length of num_channels must equal layer_num.If num_channels + is a int, all conv1dpoollayer's num_channels are the value of num_channels. + num_filters(int|list|tuple): The number of filters. It is the same as the output channels. If num_filters is a list or tuple, the length of num_filters must equal layer_num.If num_filters + is a int, all conv1dpoollayer's num_filters are the value of num_filters. + filter_size(int|list|tuple): The filter size of Conv1DPoolLayer in CNNEncoder. If filter_size is a list or tuple, the length of filter_size must equal layer_num.If filter_size + is a int, all conv1dpoollayer's filter_size are the value of filter_size. + pool_size(int|list|tuple): The pooling size of Conv1DPoolLayer in CNNEncoder.If pool_size is a list or tuple, the length of pool_size must equal layer_num.If pool_size + is a int, all conv1dpoollayer's pool_size are the value of pool_size. + layer_num(int): The number of conv1dpoolLayer used in CNNEncoder. + conv_stride(int|list|tuple): The stride size of the conv Layer in Conv1DPoolLayer. 
If conv_stride is a list or tuple, the length of conv_stride must equal layer_num.If conv_stride + is a int, all conv1dpoollayer's conv_stride are the value of conv_stride. Default: 1 + pool_stride(int|list|tuple): The stride size of the pool layer in Conv1DPoolLayer. If pool_stride is a list or tuple, the length of pool_stride must equal layer_num.If pool_stride + is a int, all conv1dpoollayer's pool_stride are the value of pool_stride. Default: 1 + conv_padding(int|list|tuple): The padding size of the conv Layer in Conv1DPoolLayer.If conv_padding is a list or tuple, the length of conv_padding must equal layer_num.If conv_padding + is a int, all conv1dpoollayer's conv_padding are the value of conv_padding. Default: 0 + pool_padding(int|list|tuple): The padding of pool layer in Conv1DPoolLayer. If pool_padding is a list or tuple, the length of pool_padding must equal layer_num.If pool_padding + is a int, all conv1dpoollayer's pool_padding are the value of pool_padding. Default: 0 use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: False - is_sparse(bool): The flag indicating whether to use sparse update. This parameter only affects the performance of the backwards gradient update. It is recommended - to set True because sparse update is faster. But some optimizer does not support sparse update,such as :ref:`api_fluid_optimizer_AdadeltaOptimizer` , - :ref:`api_fluid_optimizer_AdamaxOptimizer` , :ref:`api_fluid_optimizer_DecayedAdagradOptimizer` , :ref:`api_fluid_optimizer_FtrlOptimizer` , - :ref:`api_fluid_optimizer_LambOptimizer` and :ref:`api_fluid_optimizer_LarsMomentumOptimizer` . - In these case, is_sparse must be False. Default: True. - param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in - :ref:`api_fluid_ParamAttr` . In addition,user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector - needs to be transformed into numpy format, and the shape of local word vector should be consistent with :attr:`size` . - Then :ref:`api_fluid_initializer_NumpyArrayInitializer` is used to load custom or pre-trained word vectors. Default: None. - padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). - If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted to :math:`vocab\_size + padding\_idx` . It will output all-zero padding - data whenever lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. If set None, it makes no effect to - output. Default: None. - act (str): Activation type for `Conv1dPoollayer` layer, if it is set to None, activation is not appended. Default: None. + act (str|list|tuple): Activation type for `Conv1dPoollayer` layer, if it is set to None, activation is not appended. Default: None. 
Return: 3-D Tensor, the result of input after embedding and conv1dPoollayer @@ -2047,53 +2045,55 @@ class CNNEncoder(Layer): import paddle.fluid as fluid from hapi.text import CNNEncoder - test=np.random.uniform(1,5,[2,3,4]).astype('int64') + test=np.random.uniform(1,5,[2,3,4]).astype('float32') with fluid.dygraph.guard(): paddle_input=to_variable(test) - print(paddle_input.shape) - cov2d=CNNEncoder(128,4,3,4,2,2) + #print(paddle_input.shape) + cov2d=CNNEncoder(3,4,2,2,3) paddle_out=cov2d(paddle_input) - print(paddle_out.shape)#[8,4,2] + print(paddle_out)#[2,12,2] ``` """ def __init__(self, - dict_size, - emb_size, num_channels, num_filters, filter_size, pool_size, - use_cuda=False, - is_sparse=True, - param_attr=None, - padding_idx=None, + layer_num, + conv_stride=1, + pool_stride=1, + conv_padding=0, + pool_padding=0, + use_cudnn=False, act=None ): super(CNNEncoder, self).__init__() - self.dict_size = dict_size - self.emb_size = emb_size - self.filter_size = filter_size - self.num_filters = num_filters - self.pool_size = pool_size - self.channels = num_channels - self._emb_layer = Embedding(size=[self.dict_size, self.emb_size], - is_sparse=is_sparse, - padding_idx=padding_idx, - param_attr=param_attr) - self._cnn_layer = Conv1dPoolLayer( - self.channels, - self.num_filters, - self.filter_size, - self.pool_size, - use_cudnn=use_cuda, - act=act - ) + self.num_channels=num_channels + self.num_filters=num_filters + self.filter_size=filter_size + self.pool_size=pool_size + self.layer_num=layer_num + self.conv_stride=conv_stride + self.pool_stride=pool_stride + self.conv_padding=conv_padding + self.pool_padding=pool_padding + self.use_cudnn=use_cudnn + self.act=act + self.conv_layer = fluid.dygraph.LayerList([Conv1dPoolLayer(num_channels=self.num_channels if isinstance(self.num_channels,int) else self.num_channels[i], + num_filters=self.num_filters if isinstance(self.num_channels,int) else self.num_filters [i], + filter_size=self.filter_size if isinstance(self.filter_size,int) else self.filter_size[i], + pool_size=self.pool_size if isinstance(self.pool_size,int) else self.pool_size[i], + conv_stride=self.conv_stride if isinstance(self.conv_stride,int) else self.conv_stride[i], + pool_stride=self.pool_stride if isinstance(self.pool_stride,int) else self.pool_stride[i], + conv_padding= self.conv_padding if isinstance(self.conv_padding,int) else self.conv_padding[i], + pool_padding=self.pool_padding if isinstance(self.pool_padding,int) else self.pool_padding[i], + act=self.act[i] if isinstance(self.act,(list,tuple)) else self.act, + use_cudnn=self.use_cudnn + ) for i in range(layer_num)]) def forward(self, input): - emb = self._emb_layer(input) - emb_reshape = fluid.layers.reshape( - emb, shape=[-1, self.channels, self.emb_size]) - emb_out=self._cnn_layer(emb_reshape) - return emb_out \ No newline at end of file + res=[Conv1dPoolLayer(input) for Conv1dPoolLayer in self.conv_layer] + out=fluid.layers.concat(input=res,axis=1) + return out \ No newline at end of file From 3b5e6d9cc12af93d14134aab25d59bb7a7f807fe Mon Sep 17 00:00:00 2001 From: jinyuKing <2943829328@qq.com> Date: Fri, 8 May 2020 09:34:39 +0000 Subject: [PATCH 4/7] update text.py --- hapi/text/text.py | 357 ++++++++++++++++++++++++++-------------------- 1 file changed, 204 insertions(+), 153 deletions(-) diff --git a/hapi/text/text.py b/hapi/text/text.py index 4b9d928..10f5f9f 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -1898,14 +1898,16 @@ def forward(self, word, lengths, target=None): crf_decode = 
self.crf_decoding(input=emission, length=lengths) return crf_decode, lengths + class Conv1dPoolLayer(Layer): """ - This interface is used to construct a callable object of the ``Conv1DPoolLayer`` class.The ``Conv1DPoolLayer`` is composed of a ``Conv2D`` and a ``Pool2D`` . - For more details, refer to code examples.The ``Conv1DPoolLayer`` layer calculates the output based on the input, filter and strides, paddings, dilations, - groups,global_pooling, pool_type,ceil_mode,exclusive parameters.Input and Output are in NCH format, where N is batch size, C is the number of the feature map, - H is the height of the feature map.The data type of Input data and Output data is 'float32' or 'float64'. + This interface is used to construct a callable object of the ``Conv1DPoolLayer`` class. + The ``Conv1DPoolLayer`` class does a ``Conv1D`` and a ``Pool1D`` .For more details, + refer to code examples.The ``Conv1DPoolLayer`` layer calculates the output based on the + input, filter and strides, paddings, dilations, groups,global_pooling, pool_type,ceil_mode, + exclusive parameters. - Args: + Parameters: input(Variable):3-D Tensor, shape is [N, C, H], data type can be float32 or float64 num_channels(int): The number of channels in the input data. num_filters(int): The number of filters. It is the same as the output channels. @@ -1915,147 +1917,174 @@ class Conv1dPoolLayer(Layer): pool_stride (int): The stride size of the pool layer in Conv1DPoolLayer. Default: 1 conv_padding (int): The padding size of the conv Layer in Conv1DPoolLayer. Default: 0 pool_padding (int): The padding of pool layer in Conv1DPoolLayer. Default: 0 - pool_type (str): Pooling type can be `max` for max-pooling or `avg` for average-pooling. Default: math:`max` - global_pooling (bool): Whether to use the global pooling. If global_pooling = true, pool_size and pool_padding while be ignored. Default: False + pool_type (str): Pooling type can be `max` for max-pooling or `avg` for average-pooling. + Default: math:`max` + global_pooling (bool): Whether to use the global pooling. If global_pooling = true, + pool_size and pool_padding while be ignored. Default: False dilation (int): The dilation size of the conv Layer. Default: 1. - groups (int): The groups number of the conv Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, - the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only - connected to the second half of the input channels. Default: 1. - param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights of conv layer. If it is set to None or one attribute of - ParamAttr, conv2d will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized - with :`Normal(0.0, std)`,and the :`std` is :`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`.Default: None. - bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv.If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv2d will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not - set, the bias is initialized zero. Default: None. - name(str, optional): The default value is None. Normally there is no need for user to set this property. Default: None - act (str): Activation type for conv layer, if it is set to None, activation is not appended. Default: None. 
- use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: False - ceil_mode (bool, optional): Whether to use the ceil function to calculate output height and width. - False is the default. If it is set to False, the floor function will be used. Default: False. - exclusive (bool, optional): Whether to exclude padding points in average pooling mode. Default: True. - - Return: - 3-D Tensor, the result of input after conv and pool, with the same data type as :`input` - - Return Type: - Variable + groups (int): The groups number of the conv Layer. According to grouped convolution in + Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is + only connected to the first half of the input channels, while the second half + of the filters is only connected to the second half of the input channels. + Default: 1. + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv layer. If it is set to None or one attribute of ParamAttr, conv2d will + create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with :`Normal(0.0, std)`,and + the :`std` is :`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`.Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv.If it + is set to False, no bias will be added to the output units.If it is set to + None or one attribute of ParamAttr, conv2d will create ParamAttr as bias_attr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + Default: None. + act (str): Activation type for conv layer, if it is set to None, activation is not appended. + Default: None. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library + is installed. Default: False + ceil_mode (bool, optional): Whether to use the ceil function to calculate output + height and width.False is the default. If it is set to False, the floor function + will be used. Default: False. + exclusive (bool, optional): Whether to exclude padding points in average pooling mode. + Default: True. Example: - ```python - import paddle.fluid as fluid - from hapi.text import Conv1dPoolLayer - - test=np.random.uniform(-1,1,[2,3,4]).astype('float32') - with fluid.dygraph.guard(): - paddle_input=to_variable(test) - print(paddle_input.shape) - cov2d=Conv1dPoolLayer(3,4,2,2) - paddle_out=cov2d(paddle_input) - print(paddle_out.shape)#[2,4,2] - - ``` + + .. 
code-block:: python + + import paddle.fluid as fluid + from paddle.incubate.hapi.text import Conv1dPoolLayer + + test=np.random.uniform(-1,1,[2,3,4]).astype('float32') + with fluid.dygraph.guard(): + paddle_input=to_variable(test) + cov2d=Conv1dPoolLayer(3,4,2,2) + paddle_out=cov2d(paddle_input) + print(paddle_out.shape)#[2,4,2] + """ - def __init__(self, - num_channels, - num_filters, - filter_size, - pool_size, - conv_stride=1, - pool_stride=1, - conv_padding=0, - pool_padding=0, - pool_type='max', - global_pooling=False, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - act=None, - use_cudnn=False, - ceil_mode=False, - exclusive=True, - ): + + def __init__( + self, + num_channels, + num_filters, + filter_size, + pool_size, + conv_stride=1, + pool_stride=1, + conv_padding=0, + pool_padding=0, + pool_type='max', + global_pooling=False, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + act=None, + use_cudnn=False, + ceil_mode=False, + exclusive=True, ): super(Conv1dPoolLayer, self).__init__() - self._conv2d = Conv2D(num_channels=num_channels, - num_filters=num_filters, - filter_size=[filter_size,1], - stride=[conv_stride,1], - padding=[conv_padding,0], - dilation=[dilation,1], - groups=groups, - param_attr=param_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act) - self._pool2d = Pool2D(pool_size=[pool_size,1], - pool_type=pool_type, - pool_stride=[pool_stride,1], - pool_padding=[pool_padding,0], - global_pooling=global_pooling, - use_cudnn=use_cudnn, - ceil_mode=ceil_mode, - exclusive=exclusive - ) + self._conv2d = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=[filter_size, 1], + stride=[conv_stride, 1], + padding=[conv_padding, 0], + dilation=[dilation, 1], + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act) + self._pool2d = Pool2D( + pool_size=[pool_size, 1], + pool_type=pool_type, + pool_stride=[pool_stride, 1], + pool_padding=[pool_padding, 0], + global_pooling=global_pooling, + use_cudnn=use_cudnn, + ceil_mode=ceil_mode, + exclusive=exclusive) + def forward(self, inputs): - x = fluid.layers.unsqueeze(inputs,axes=[-1]) + """ + Performs :Code:`Conv1dPoolLayer.forward` receives input data. After a conv and a pool, + the result will be output. + + Parameters: + inputs(Variable): Inputs are 3-D Tensor, shape is [N, C, H] , where N is batch size, + C is the number of the feature map, H is the height of the feature map.The data + type of Input data is 'float32' or 'float64'. + + Returns: + 3-D Tensor, with the same data type as :`input` + """ + x = fluid.layers.unsqueeze(inputs, axes=[-1]) x = self._conv2d(x) x = self._pool2d(x) x = fluid.layers.squeeze(x, axes=[-1]) return x - - class CNNEncoder(Layer): """ - This interface is used to construct a callable object of the ``CNNEncoder`` class.The ``CNNEncoder`` is composed of a ``Embedding`` and a ``Conv1dPoolLayer`` . - For more details, refer to code examples. The ``CNNEncoder`` layer calculates the output based on the input, dict_size and emb_dim, filter_size, num_filters, - use_cuda, is_sparse, param_attr parameters. The type of Input data is a 3-D Tensor .The data type of Input data is 'float32'. Output data are in NCH - format, where N is batch size, C is the number of the feature map, H is the height of the feature map. The data type of Output data is 'float32' or 'float64'. 
- - Args: - num_channels(int|list|tuple): The number of channels in the input data.If num_channels is a list or tuple, the length of num_channels must equal layer_num.If num_channels - is a int, all conv1dpoollayer's num_channels are the value of num_channels. - num_filters(int|list|tuple): The number of filters. It is the same as the output channels. If num_filters is a list or tuple, the length of num_filters must equal layer_num.If num_filters - is a int, all conv1dpoollayer's num_filters are the value of num_filters. - filter_size(int|list|tuple): The filter size of Conv1DPoolLayer in CNNEncoder. If filter_size is a list or tuple, the length of filter_size must equal layer_num.If filter_size - is a int, all conv1dpoollayer's filter_size are the value of filter_size. - pool_size(int|list|tuple): The pooling size of Conv1DPoolLayer in CNNEncoder.If pool_size is a list or tuple, the length of pool_size must equal layer_num.If pool_size - is a int, all conv1dpoollayer's pool_size are the value of pool_size. - layer_num(int): The number of conv1dpoolLayer used in CNNEncoder. - conv_stride(int|list|tuple): The stride size of the conv Layer in Conv1DPoolLayer. If conv_stride is a list or tuple, the length of conv_stride must equal layer_num.If conv_stride - is a int, all conv1dpoollayer's conv_stride are the value of conv_stride. Default: 1 - pool_stride(int|list|tuple): The stride size of the pool layer in Conv1DPoolLayer. If pool_stride is a list or tuple, the length of pool_stride must equal layer_num.If pool_stride - is a int, all conv1dpoollayer's pool_stride are the value of pool_stride. Default: 1 - conv_padding(int|list|tuple): The padding size of the conv Layer in Conv1DPoolLayer.If conv_padding is a list or tuple, the length of conv_padding must equal layer_num.If conv_padding - is a int, all conv1dpoollayer's conv_padding are the value of conv_padding. Default: 0 - pool_padding(int|list|tuple): The padding of pool layer in Conv1DPoolLayer. If pool_padding is a list or tuple, the length of pool_padding must equal layer_num.If pool_padding - is a int, all conv1dpoollayer's pool_padding are the value of pool_padding. Default: 0 - use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: False - act (str|list|tuple): Activation type for `Conv1dPoollayer` layer, if it is set to None, activation is not appended. Default: None. - - Return: - 3-D Tensor, the result of input after embedding and conv1dPoollayer - - Return Type: - Variable + This interface is used to construct a callable object of the ``CNNEncoder`` class.The ``CNNEncoder`` + is composed of ``Conv1dPoolLayer`` .``CNNEncoder`` can define every Conv1dPoolLayer with different + or same parameters. The ``Conv1dPoolLayer`` in ``CNNEncoder`` is parallel.The result of every + ``Conv1dPoolLayer`` will concat as the final output.For more details, refer to code examples. + The ``CNNEncoder`` layer calculates the output based on the input, dict_size and emb_dim, + filter_size, num_filters, use_cuda, is_sparse, param_attr parameters. + + Parameters: + num_channels(int|list|tuple): The number of channels in the input data.If num_channels is + a list or tuple, the length of num_channels must equal layer_num.If num_channels + is a int, all conv1dpoollayer's num_channels are the value of num_channels. + num_filters(int|list|tuple): The number of filters. It is the same as the output channels. + If num_filters is a list or tuple, the length of num_filters must equal layer_num. 
+ If num_filters is a int, all conv1dpoollayer's num_filters are the value of num_filters. + filter_size(int|list|tuple): The filter size of Conv1DPoolLayer in CNNEncoder. If filter_size + is a list or tuple, the length of filter_size must equal layer_num.If filter_size is a + int, all conv1dpoollayer's filter_size are the value of filter_size. + pool_size(int|list|tuple): The pooling size of Conv1DPoolLayer in CNNEncoder.If pool_size is + a list or tuple, the length of pool_size must equal layer_num.If pool_size is a int, + all conv1dpoollayer's pool_size are the value of pool_size. + layer_num(int): The number of conv1dpoolLayer used in CNNEncoder. + conv_stride(int|list|tuple): The stride size of the conv Layer in Conv1DPoolLayer. If + conv_stride is a list or tuple, the length of conv_stride must equal layer_num. + If conv_stride is a int, all conv1dpoollayer's conv_stride are the value of + conv_stride. Default: 1 + pool_stride(int|list|tuple): The stride size of the pool layer in Conv1DPoolLayer. If + pool_stride is a list or tuple, the length of pool_stride must equal layer_num. + If pool_stride is a int, all conv1dpoollayer's pool_stride are the value of + pool_stride. Default: 1 + conv_padding(int|list|tuple): The padding size of the conv Layer in Conv1DPoolLayer. + If conv_padding is a list or tuple, the length of conv_padding must equal layer_num. + If conv_padding is a int, all conv1dpoollayer's conv_padding are the value of + conv_padding. Default: 0 + pool_padding(int|list|tuple): The padding of pool layer in Conv1DPoolLayer. If pool_padding is + a list or tuple, the length of pool_padding must equal layer_num.If pool_padding is a + int, all conv1dpoollayer's pool_padding are the value of pool_padding. Default: 0 + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. + Default: False + act (str|list|tuple): Activation type for `Conv1dPoollayer` layer, if it is set to None, + activation is not appended. Default: None. Example: - ```python - import paddle.fluid as fluid - from hapi.text import CNNEncoder - test=np.random.uniform(1,5,[2,3,4]).astype('float32') - with fluid.dygraph.guard(): - paddle_input=to_variable(test) - #print(paddle_input.shape) - cov2d=CNNEncoder(3,4,2,2,3) - paddle_out=cov2d(paddle_input) - print(paddle_out)#[2,12,2] + .. 
code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+            from paddle.fluid.dygraph import to_variable
+            from paddle.incubate.hapi.text import CNNEncoder
+
+            test = np.random.uniform(-1, 1, [2, 3, 4]).astype('float32')
+            with fluid.dygraph.guard():
+                paddle_input = to_variable(test)
+                encoder = CNNEncoder(3, 4, 2, 2, 3)
+                paddle_out = encoder(paddle_input)
+                print(paddle_out.shape)  # [2, 12, 2]
     """

     def __init__(self,
                  num_channels,
                  num_filters,
                  filter_size,
                  pool_size,
                  layer_num,
@@ -2067,33 +2096,55 @@ def __init__(self,
                  conv_padding=0,
                  pool_padding=0,
                  use_cudnn=False,
-                 act=None
-                 ):
+                 act=None):
         super(CNNEncoder, self).__init__()
-        self.num_channels=num_channels
-        self.num_filters=num_filters
-        self.filter_size=filter_size
-        self.pool_size=pool_size
-        self.layer_num=layer_num
-        self.conv_stride=conv_stride
-        self.pool_stride=pool_stride
-        self.conv_padding=conv_padding
-        self.pool_padding=pool_padding
-        self.use_cudnn=use_cudnn
-        self.act=act
-        self.conv_layer = fluid.dygraph.LayerList([Conv1dPoolLayer(num_channels=self.num_channels if isinstance(self.num_channels,int) else self.num_channels[i],
-                                                    num_filters=self.num_filters if isinstance(self.num_channels,int) else self.num_filters [i],
-                                                    filter_size=self.filter_size if isinstance(self.filter_size,int) else self.filter_size[i],
-                                                    pool_size=self.pool_size if isinstance(self.pool_size,int) else self.pool_size[i],
-                                                    conv_stride=self.conv_stride if isinstance(self.conv_stride,int) else self.conv_stride[i],
-                                                    pool_stride=self.pool_stride if isinstance(self.pool_stride,int) else self.pool_stride[i],
-                                                    conv_padding= self.conv_padding if isinstance(self.conv_padding,int) else self.conv_padding[i],
-                                                    pool_padding=self.pool_padding if isinstance(self.pool_padding,int) else self.pool_padding[i],
-                                                    act=self.act[i] if isinstance(self.act,(list,tuple)) else self.act,
-                                                    use_cudnn=self.use_cudnn
-                                                    ) for i in range(layer_num)])
+        self.num_channels = num_channels
+        self.num_filters = num_filters
+        self.filter_size = filter_size
+        self.pool_size = pool_size
+        self.layer_num = layer_num
+        self.conv_stride = conv_stride
+        self.pool_stride = pool_stride
+        self.conv_padding = conv_padding
+        self.pool_padding = pool_padding
+        self.use_cudnn = use_cudnn
+        self.act = act
+        self.conv_layer = fluid.dygraph.LayerList([
+            Conv1dPoolLayer(
+                num_channels=self.num_channels if
+                isinstance(self.num_channels, int) else self.num_channels[i],
+                num_filters=self.num_filters
+                if isinstance(self.num_filters, int) else self.num_filters[i],
+                filter_size=self.filter_size
+                if isinstance(self.filter_size, int) else self.filter_size[i],
+                pool_size=self.pool_size
+                if isinstance(self.pool_size, int) else self.pool_size[i],
+                conv_stride=self.conv_stride
+                if isinstance(self.conv_stride, int) else self.conv_stride[i],
+                pool_stride=self.pool_stride
+                if isinstance(self.pool_stride, int) else self.pool_stride[i],
+                conv_padding=self.conv_padding if
+                isinstance(self.conv_padding, int) else self.conv_padding[i],
+                pool_padding=self.pool_padding if
+                isinstance(self.pool_padding, int) else self.pool_padding[i],
+                act=self.act[i]
+                if isinstance(self.act, (list, tuple)) else self.act,
+                use_cudnn=self.use_cudnn) for i in range(layer_num)
+        ])
 
     def forward(self, input):
-        res=[Conv1dPoolLayer(input) for Conv1dPoolLayer in self.conv_layer]
-        out=fluid.layers.concat(input=res,axis=1)
-        return out
\ No newline at end of file
+        """
+        Performs :code:`CNNEncoder.forward` on the input data. The result of every
+        ``Conv1dPoolLayer`` is stored in a list, and the results are then concatenated
+        as the final output.
+
+        Parameters:
+            input(Variable): The input is a 3-D Tensor, shape is [N, C, H], where N is batch size,
+                C is the number of the feature map, H is the height of the feature map. The data
+                type of the input is 'float32' or 'float64'.
+
+        Returns:
+            3-D Tensor, with the same data type as :attr:`input`.
+        """
+        res = [conv1d_pool_layer(input) for conv1d_pool_layer in self.conv_layer]
+        out = fluid.layers.concat(input=res, axis=1)
+        return out

From 4f3aebce1a6effb5ee697c36a7ffadd622e466cc Mon Sep 17 00:00:00 2001
From: guosheng
Date: Thu, 28 May 2020 04:38:59 +0000
Subject: [PATCH 5/7] add sim_net model

---
 examples/similarity_net/README.md             | 199 ++++++++
 examples/similarity_net/config.py             |  60 +++
 examples/similarity_net/download.py           | 154 +++++++
 examples/similarity_net/download_data.sh      |   5 +
 examples/similarity_net/nets/bow.py           | 115 +++++
 examples/similarity_net/nets/cnn.py           | 115 +++++
 .../similarity_net/nets/losses/hinge_loss.py  |  39 ++
 .../similarity_net/nets/losses/log_loss.py    |  32 ++
 .../nets/losses/softmax_cross_entropy_loss.py |  31 ++
 examples/similarity_net/reader.py             | 280 ++++++++++++
 examples/similarity_net/run.sh                | 101 +++++
 examples/similarity_net/run_classifier.py     | 426 ++++++++++++++++++
 examples/similarity_net/utils.py              | 244 ++++++++++
 13 files changed, 1801 insertions(+)
 create mode 100644 examples/similarity_net/README.md
 create mode 100644 examples/similarity_net/config.py
 create mode 100644 examples/similarity_net/download.py
 create mode 100644 examples/similarity_net/download_data.sh
 create mode 100644 examples/similarity_net/nets/bow.py
 create mode 100644 examples/similarity_net/nets/cnn.py
 create mode 100644 examples/similarity_net/nets/losses/hinge_loss.py
 create mode 100644 examples/similarity_net/nets/losses/log_loss.py
 create mode 100644 examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py
 create mode 100644 examples/similarity_net/reader.py
 create mode 100644 examples/similarity_net/run.sh
 create mode 100644 examples/similarity_net/run_classifier.py
 create mode 100644 examples/similarity_net/utils.py

diff --git a/examples/similarity_net/README.md b/examples/similarity_net/README.md
new file mode 100644
index 0000000..4f7270b
--- /dev/null
+++ b/examples/similarity_net/README.md
@@ -0,0 +1,199 @@
+# Short Text Semantic Matching
+## Introduction
+### Task description
+Short text semantic matching (SimilarityNet, SimNet) is a framework for computing the similarity of short texts: given two texts entered by a user, it computes a similarity score. The SimNet framework is widely used across Baidu products. It covers core network structures such as BOW, CNN, RNN, and MMDNN, and provides training and prediction pipelines for semantic similarity computation. It suits application scenarios such as information retrieval, news recommendation, and intelligent customer service, helping enterprises solve semantic matching problems.
+
+### Results
+Based on Baidu's massive search data, we trained a SimNet-BOW-Pairwise semantic matching model. In several real FAQ question-answering scenarios, this model improves AUC by more than 5% over similarity methods based on surface forms. We evaluated it on Baidu's in-house test sets (covering chat, customer service, and other datasets); the results are shown in the table below.
+
+| Model | Baidu Zhidao | ECOM | QQSIM | UNICOM |
+|:-----------:|:-------------:|:-------------:|:-------------:|:-------------:|
+| | AUC | AUC | AUC | positive/negative order ratio |
+|BOW_Pairwise|0.6815|0.7331|0.7638|1.5565|
+
+#### Test set description
+| Dataset | Source | Domain |
+|:-----------:|:-------------:|:-------------:|
+|Baidu Zhidao | Baidu Zhidao questions | daily life |
+|ECOM|commercial queries|finance|
+|QQSIM|casual chat|daily life|
+|UNICOM|China Unicom customer service|customer service|
+## Quick start
+#### Version requirements
+
+This project depends on PaddlePaddle Fluid 1.7; see the [installation guide](http://www.paddlepaddle.org/#quick-start) for installation instructions.
+
+#### Getting the code
+Clone the repository:
+```shell
+git clone https://github.com/PaddlePaddle/models.git
+
+cd models/dygraph/similarity_net
+```
+#### Data preparation
+Download the preprocessed data. After the command finishes, the data directory will contain sample training data, sample test data, and the corresponding word-to-index dictionary (term2id.dict).
+
+```shell
+sh download_data.sh
+```
+or
+```
+python download.py dataset
+```
+#### Model preparation
+We open-sourced a ```pairwise``` model (trained with the BOW network) trained on large-scale data. You can download the pretrained model by running the command below; the model will be saved under ```./model_files/simnet_bow_pairwise_pretrained_model/```.
+```shell
+sh download_pretrained_model.sh
+```
+or
+``` +python download.py model +``` + +#### 评估 +我们公开了自建的测试集,包括百度知道、ECOM、QQSIM、UNICOM四个数据集,基于上面的预训练模型,用户可以进入evaluate目录下依次执行下列命令获取测试集评估结果。 +```shell +sh evaluate_ecom.sh +sh evaluate_qqsim.sh +sh evaluate_zhidao.sh +sh evaluate_unicom.sh +``` +用户也可以指定./run.sh中的TEST_DATA_PATH的值,通过下列命令评估自己指定的测试集。 +```shell +sh run.sh eval +``` + +#### 推测 +基于上面的预训练模型,可以运行下面的命令进行推测,并保存推测结果到本地。 +```shell +sh run.sh infer +``` +#### 训练与验证 +用户可以基于示例数据构建训练集和开发集,可以运行下面的命令,进行模型训练和开发集验证。 +```shell +sh run.sh train +``` +用户也可以指定./run.sh中train()函数里的INIT_CHECKPOINT的值,载入训练好的模型进行热启动训练。 +## 进阶使用 + +### 任务定义与建模 + +传统的文本匹配技术如信息检索中的向量空间模型 VSM、BM25 等算法,主要解决词汇层面的相似度问题,这种方法的效果在实际应用中受到语言的多义词和语言结构等问题影响。SimNet 在语义表示上沿袭了隐式连续向量表示的方式,但对语义匹配问题在深度学习框架下进行了 End-to-End 的建模,将```point-wise```与 ```pair-wise```两种有监督学习方式全部统一在一个整体框架内。在实际应用场景下,将海量的用户点击行为数据转化为大规模的弱标记数据,在网页搜索任务上的初次使用即展现出极大威力,带来了相关性的明显提升。 + +### 模型原理介绍 + +SimNet如下图所示: + +
+align="center">SimNet 结构图</p>
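+
+As a rough sketch of the two supervision modes described above (the `encode`
+function below is a hypothetical stand-in for the BOW/CNN/RNN encoders in
+`./nets/`, not code shipped with this repo):
+
+```python
+import numpy as np
+
+
+def cosine(a, b):
+    # matching score used by the pairwise models (cf. fluid.layers.cos_sim)
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))
+
+
+def pairwise_hinge(encode, query, pos_title, neg_title, margin=0.1):
+    # pairwise mode: the positive title should outscore the negative
+    # one by at least `margin` (cf. nets/losses/hinge_loss.py)
+    pos_score = cosine(encode(query), encode(pos_title))
+    neg_score = cosine(encode(query), encode(neg_title))
+    return max(0.0, neg_score - pos_score + margin)
+
+
+def pointwise_features(encode, query, title):
+    # pointwise mode: concatenated pair features feed a binary
+    # (similar / dissimilar) softmax classifier
+    return np.concatenate([encode(query), encode(title)])
+```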
+ +### 数据格式说明 + +训练模式一共分为```pairwise```和```pointwise```两种模式。 + +#### pairwise模式: +训练集格式如下: query \t pos_query \t neg_query。 +query、pos_query和neg_query是以空格分词的中文文本,中间使用制表符'\t'隔开,pos_query表示与query相似的正例,neg_query表示与query不相似的随机负例,文本编码为utf-8。
+``` +现在 安卓模拟器 哪个 好 用 电脑 安卓模拟器 哪个 更好 电信 手机 可以 用 腾讯 大王 卡 吗 ? +土豆 一亩地 能 收 多少 斤 一亩 地土豆 产 多少 斤 一亩 地 用 多少 斤 土豆 种子 +``` + +开发集和测试集格式:query1 \t query2 \t label。
+ +query1和query2表示以空格分词的中文文本,label为0或1,1表示query1与query2相似,0表示query1与query2不相似,query1、query2和label中间以制表符'\t'隔开,文本编码为utf-8。
+``` +现在 安卓模拟器 哪个 好 用 电脑 安卓模拟器 哪个 更好 1 +为什么 头发 掉 得 很厉害 我 头发 为什么 掉 得 厉害 1 +常喝 薏米 水 有 副 作用 吗 女生 可以 长期 喝 薏米 水养生 么 0 +长 的 清新 是 什么 意思 小 清新 的 意思 是 什么 0 +``` + +#### pointwise模式: + +训练集、开发集和测试集数据格式相同:query1和query2表示以空格分词的中文文本,label为0或1,1表示query1与query2相似,0表示query1与query2不相似,query1、query2和label中间以制表符'\t'隔开,文本编码为utf-8。 +``` +现在 安卓模拟器 哪个 好 用 电脑 安卓模拟器 哪个 更好 1 +为什么 头发 掉 得 很厉害 我 头发 为什么 掉 得 厉害 1 +常喝 薏米 水 有 副 作用 吗 女生 可以 长期 喝 薏米 水养生 么 0 +长 的 清新 是 什么 意思 小 清新 的 意思 是 什么 0 +``` + +#### infer数据集: + +```pairwise```和```pointwise```的infer数据集格式相同:query1 \t query2。
+ +query1和query2为以空格分词的中文文本。 +``` +怎么 调理 湿热 体质 ? 湿热 体质 怎样 调理 啊 +搞笑 电影 美国 搞笑 的 美国 电影 +``` + +__注__:本项目额外提供了分词预处理脚本(在preprocess目录下),可供用户使用,具体使用方法如下: + +```shell +python tokenizer.py --test_data_dir ./test.txt.utf8 --batch_size 1 > test.txt.utf8.seg +``` +其中test.txt.utf8为待分词的文件,一条文本数据一行,utf8编码,分词结果存放在test.txt.utf8.seg文件中 + +### 代码结构说明 +```text +. +├── run_classifier.py:该项目的主函数,封装包括训练、预测、评估的部分 +├── config.py:定义该项目模型的配置类,读取具体模型类别、以及模型的超参数等 +├── reader.py:定义了读入数据的相关函数 +├── utils.py:定义了其他常用的功能函数 +├── Config: 定义多种模型的配置文件 +├── download.py: 下载数据及预训练模型脚本 +├── nets: 基于动态图的网络结构 +``` + +### 如何训练 +```shell +python run_classifier.py \ + --task_name ${TASK_NAME} \ + --use_cuda false \ #是否使用GPU + --do_train True \ #是否训练 + --do_valid True \ #是否在训练中测试开发集 + --do_test True \ #是否验证测试集 + --do_infer False \ #是否预测 + --batch_size 128 \ #batch_size的值 + --train_data_dir ${TRAIN_DATA_kPATH} \ #训练集的路径 + --valid_data_dir ${VALID_DATA_PATH} \ #开发集的路径 + --test_data_dir ${TEST_DATA_PATH} \ #测试集的路径 + --infer_data_dir ${INFER_DATA_PATH} \ #待推测数据的路径 + --output_dir ${CKPT_PATH} \ #模型存放的路径 + --config_path ${CONFIG_PATH} \ #配置文件路径 + --vocab_path ${VOCAB_PATH} \ #字典路径 + --epoch 10 \ #epoch值 + --save_steps 1000 \ #每save_steps保存一次模型 + --validation_steps 100 \ #每validation_steps验证一次开发集结果 + --task_mode ${TASK_MODE} #训练模式,pairwise或pointwise,与相应的配置文件匹配。 + --compute_accuracy False \ #是否计算accuracy + --lamda 0.91 \ #pairwise模式计算accuracy时的阈值 + --init_checkpoint "" #预加载模型路径 +``` +### 如何组建自己的模型 +用户可以根据自己的需求,组建自定义的模型,具体方法如下所示: + +i. 定义自己的网络结构 + +用户可以在```./nets/```下定义自己的模型; + +ii. 更改模型配置 + +用户仿照```config```中的文件生成自定义模型的配置文件。 + +用户需要保留配置文件中的```net```、```loss```、```optimizer```、```task_mode```和```model_path```字段。```net```为用户自定义的模型参数,```task_mode```表示训练模式,为```pairwise```或```pointwise```,要与训练命令中的```--task_mode```命令保持一致,```model_path```为模型保存路径,```loss```和```optimizer```依据自定义模型的需要仿照```config```下的其他文件填写。 + + +iii.模型训练,运行训练、评估、预测脚本即可(具体方法同上)。 + +## 其他 +### 如何贡献代码 +如果你可以修复某个issue或者增加一个新功能,欢迎给我们提交PR。如果对应的PR被接受了,我们将根据贡献的质量和难度进行打分(0-5分,越高越好)。如果你累计获得了10分,可以联系我们获得面试机会或者为你写推荐信。 diff --git a/examples/similarity_net/config.py b/examples/similarity_net/config.py new file mode 100644 index 0000000..bfd3260 --- /dev/null +++ b/examples/similarity_net/config.py @@ -0,0 +1,60 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
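+# For reference, a hypothetical config of the kind this class parses (the
+# field layout follows the README's net/loss/optimizer/task_mode/model_path
+# convention; the concrete values below are illustrative only):
+#
+# {
+#     "net": {"module_name": "bow", "class_name": "Pair_BOWModel",
+#             "emb_dim": 128, "bow_dim": 128},
+#     "loss": {"module_name": "hinge_loss", "class_name": "HingeLoss",
+#              "margin": 0.1},
+#     "optimizer": {"class_name": "SGDOptimizer", "learning_rate": 0.001},
+#     "task_mode": "pairwise",
+#     "model_path": "bow_pairwise"
+# }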
+""" +SimNet config +""" + +import six +import json +import io + + +class SimNetConfig(object): + """ + simnet Config + """ + + def __init__(self, args): + self.task_mode = args.task_mode + self.config_path = args.config_path + self._config_dict = self._parse(args.config_path) + + def _parse(self, config_path): + try: + with io.open(config_path) as json_file: + config_dict = json.load(json_file) + except Exception: + raise IOError("Error in parsing simnet model config file '%s'" % + config_path) + + else: + if config_dict["task_mode"] != self.task_mode: + raise ValueError( + "the config '{}' does not match the task_mode '{}'".format( + self.config_path, self.task_mode)) + return config_dict + + def __getitem__(self, key): + return self._config_dict[key] + + def __setitem__(self, key, value): + self._config_dict[key] = value + + def print_config(self): + """ + Print Config + """ + for arg, value in sorted(six.iteritems(self._config_dict)): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') diff --git a/examples/similarity_net/download.py b/examples/similarity_net/download.py new file mode 100644 index 0000000..93b69f6 --- /dev/null +++ b/examples/similarity_net/download.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Download script, download dataset and pretrain models. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import io +import os +import sys +import time +import hashlib +import tarfile +import requests + + +def usage(): + desc = ( + "\nDownload datasets and pretrained models for SimilarityNet task.\n" + "Usage:\n" + " 1. python download.py dataset\n" + " 2. 
python download.py model\n") + print(desc) + + +def md5file(fname): + hash_md5 = hashlib.md5() + with io.open(fname, "rb") as fin: + for chunk in iter(lambda: fin.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + + +def extract(fname, dir_path): + """ + Extract tar.gz file + """ + try: + tar = tarfile.open(fname, "r:gz") + file_names = tar.getnames() + for file_name in file_names: + tar.extract(file_name, dir_path) + print(file_name) + tar.close() + except Exception as e: + raise e + + +def download(url, filename, md5sum): + """ + Download file and check md5 + """ + retry = 0 + retry_limit = 3 + chunk_size = 4096 + + while not (os.path.exists(filename) and md5file(filename) == md5sum): + if retry < retry_limit: + retry += 1 + else: + raise RuntimeError( + "Cannot download dataset ({0}) with retry {1} times.".format( + url, retry_limit)) + try: + start = time.time() + size = 0 + res = requests.get(url, stream=True) + filesize = int(res.headers['content-length']) + if res.status_code == 200: + print("[Filesize]: %0.2f MB" % (filesize / 1024 / 1024)) + # save by chunk + with io.open(filename, "wb") as fout: + for chunk in res.iter_content(chunk_size=chunk_size): + if chunk: + fout.write(chunk) + size += len(chunk) + pr = '>' * int(size * 50 / filesize) + print( + '\r[Process ]: %s%.2f%%' % + (pr, float(size / filesize * 100)), + end='') + end = time.time() + print("\n[CostTime]: %.2f s" % (end - start)) + except Exception as e: + print(e) + + +def download_dataset(dir_path): + BASE_URL = "https://baidu-nlp.bj.bcebos.com/" + DATASET_NAME = "simnet_dataset-1.0.0.tar.gz" + DATASET_MD5 = "ec65b313bc237150ef536a8d26f3c73b" + file_path = os.path.join(dir_path, DATASET_NAME) + url = BASE_URL + DATASET_NAME + + if not os.path.exists(dir_path): + os.makedirs(dir_path) + # download dataset + print("Downloading dataset: %s" % url) + download(url, file_path, DATASET_MD5) + # extract dataset + print("Extracting dataset: %s" % file_path) + extract(file_path, dir_path) + os.remove(file_path) + + +def download_model(dir_path): + MODELS = {} + BASE_URL = "https://baidu-nlp.bj.bcebos.com/" + CNN_NAME = "simnet_bow_pairwise_dygraph.tar.gz" + CNN_MD5 = "30012af0ca8cdf0c613d8f56884f0f48" + MODELS[CNN_NAME] = CNN_MD5 + + if not os.path.exists(dir_path): + os.makedirs(dir_path) + + for model in MODELS: + url = BASE_URL + model + model_path = os.path.join(dir_path, model) + print("Downloading model: %s" % url) + # download model + download(url, model_path, MODELS[model]) + # extract model.tar.gz + print("Extracting model: %s" % model_path) + extract(model_path, dir_path) + os.remove(model_path) + + +if __name__ == '__main__': + if len(sys.argv) != 2: + usage() + sys.exit(1) + + if sys.argv[1] == "dataset": + pwd = os.path.join(os.path.dirname(__file__), './') + download_dataset(pwd) + elif sys.argv[1] == "model": + pwd = os.path.join(os.path.dirname(__file__), './model_files') + download_model(pwd) + else: + usage() diff --git a/examples/similarity_net/download_data.sh b/examples/similarity_net/download_data.sh new file mode 100644 index 0000000..ea1aaf9 --- /dev/null +++ b/examples/similarity_net/download_data.sh @@ -0,0 +1,5 @@ +#get data +wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz +tar xzf simnet_dataset-1.0.0.tar.gz +rm simnet_dataset-1.0.0.tar.gz + diff --git a/examples/similarity_net/nets/bow.py b/examples/similarity_net/nets/bow.py new file mode 100644 index 0000000..bbe9b14 --- /dev/null +++ b/examples/similarity_net/nets/bow.py @@ -0,0 
+1,115 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +bow class +""" +import paddle.fluid as fluid +from paddle.fluid.dygraph import Linear, Layer, Embedding +from paddle.incubate.hapi.model import Model + + +#1. define BOWEncoder +class BOWEncoder(Layer): + """ + simple BOWEncoder for simnet + """ + + def __init__(self, dict_size, bow_dim, seq_len, emb_dim, padding_idx): + super(BOWEncoder, self).__init__() + self.dict_size = dict_size + self.bow_dim = bow_dim + self.seq_len = seq_len + self.emb_dim = emb_dim + self.padding_idx = padding_idx + self.emb_layer = Embedding( + size=[self.dict_size, self.emb_dim], + is_sparse=True, + padding_idx=self.padding_idx, + param_attr=fluid.ParamAttr( + name='emb', initializer=fluid.initializer.Xavier())) + + def forward(self, input): + emb = self.emb_layer(input) + emb_reshape = fluid.layers.reshape( + emb, shape=[-1, self.seq_len, self.bow_dim]) + bow_emb = fluid.layers.reduce_sum(emb_reshape, dim=1) + return bow_emb + + +class Pair_BOWModel(Model): + """ + classify model + """ + + def __init__(self, conf_dict): + super(Pair_BOWModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.bow_dim = conf_dict["net"]["bow_dim"] + self.seq_len = conf_dict["seq_len"] + self.padding_idx = None + + self.emb_layer = BOWEncoder(self.dict_size, self.bow_dim, self.seq_len, + self.emb_dim, self.padding_idx) + self.bow_layer = Linear( + input_dim=self.bow_dim, output_dim=self.bow_dim) + + def forward(self, left, pos_right, neg_right): + bow_left = self.emb_layer(left) + pos_bow_right = self.emb_layer(pos_right) + neg_bow_right = self.emb_layer(neg_right) + left_soft = fluid.layers.softsign(bow_left) + pos_right_soft = fluid.layers.softsign(pos_bow_right) + neg_right_soft = fluid.layers.softsign(neg_bow_right) + + left_bow = self.bow_layer(left_soft) + pos_right_bow = self.bow_layer(pos_right_soft) + neg_right_bow = self.bow_layer(neg_right_soft) + pos_pred = fluid.layers.cos_sim(left_bow, pos_right_bow) + neg_pred = fluid.layers.cos_sim(left_bow, neg_right_bow) + return pos_pred, neg_pred + + +class Point_BOWModel(Model): + """ + classify model + """ + + def __init__(self, conf_dict): + super(Point_BOWModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.bow_dim = conf_dict["net"]["bow_dim"] + self.seq_len = conf_dict["seq_len"] + self.padding_idx = None + + self.emb_layer = BOWEncoder(self.dict_size, self.bow_dim, self.seq_len, + self.emb_dim, self.padding_idx) + self.bow_layer_po = Linear( + input_dim=self.bow_dim * 2, output_dim=self.bow_dim) + self.softmax_layer = Linear( + input_dim=self.bow_dim, output_dim=2, act='softmax') + + def forward(self, left, right): + bow_left = self.emb_layer(left) + bow_right = self.emb_layer(right) + left_soft = fluid.layers.softsign(bow_left) + right_soft 
= fluid.layers.softsign(bow_right) + + concat = fluid.layers.concat([left_soft, right_soft], axis=1) + concat_fc = self.bow_layer_po(concat) + pred = self.softmax_layer(concat_fc) + return pred diff --git a/examples/similarity_net/nets/cnn.py b/examples/similarity_net/nets/cnn.py new file mode 100644 index 0000000..97dbb2f --- /dev/null +++ b/examples/similarity_net/nets/cnn.py @@ -0,0 +1,115 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +cnn class +""" +import paddle.fluid as fluid +from paddle.fluid.dygraph import Linear, Layer, Conv2D, Pool2D +from paddle.incubate.hapi.model import Model +from paddle.incubate.hapi.text.text import CNNEncoder + + +class Pair_CNNModel(Model): + """ + classify model + """ + + def __init__(self, conf_dict): + super(Pair_CNNModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.filter_size = conf_dict["net"]["filter_size"] + self.num_filters = conf_dict["net"]["num_filters"] + self.hidden_dim = conf_dict["net"]["hidden_dim"] + self.seq_len = conf_dict["seq_len"] + self.padding_idx = None + #layers + self.encoder_layer = CNNEncoder( + num_channels=1, + num_filters=self.num_filters, + filter_size=self.filter_size, + pool_size=1, + layer_num=1, + act='relu') + self.fc_layer = Linear( + input_dim=self.num_filters * self.seq_len, + output_dim=self.hidden_dim) + self.fc_layer_po = Linear( + input_dim=self.num_filters * self.seq_len * 2, + output_dim=self.hidden_dim) + self.softmax_layer = Linear( + input_dim=self.hidden_dim, output_dim=2, act='softmax') + + def forward(self, left, pos_right, neg_right): + left = fluid.layers.reshape( + left, shape=[-1, self.seq_len, self.hidden_dim]) + pos_right = fluid.layers.reshape( + pos_right, shape=[-1, self.seq_len, self.hidden_dim]) + neg_right = fluid.layers.reshape( + neg_right, shape=[-1, self.seq_len, self.hidden_dim]) + left_cnn = self.encoder_layer(left) + pos_right_cnn = self.encoder_layer(pos_right) + neg_right_cnn = self.encoder_layer(neg_right) + left_fc = self.fc_layer(left_cnn) + pos_right_fc = self.fc_layer(pos_right_cnn) + neg_right_fc = self.fc_layer(neg_right_cnn) + pos_pred = fluid.layers.cos_sim(left_fc, pos_right_fc) + neg_pred = fluid.layers.cos_sim(left_fc, neg_right_fc) + return pos_pred, neg_pred + + +class Point_CNNModel(Model): + """ + classify model + """ + + def __init__(self, conf_dict): + super(Point_CNNModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.filter_size = conf_dict["net"]["filter_size"] + self.num_filters = conf_dict["net"]["num_filters"] + self.hidden_dim = conf_dict["net"]["hidden_dim"] + self.seq_len = conf_dict["seq_len"] + self.padding_idx = None + #layers + self.encoder_layer = CNNEncoder( + num_channels=1, + num_filters=self.num_filters, + filter_size=self.filter_size, + pool_size=1, + 
layer_num=1, + act='relu') + self.fc_layer = Linear( + input_dim=self.num_filters * self.seq_len, + output_dim=self.hidden_dim) + self.fc_layer_po = Linear( + input_dim=self.num_filters * self.seq_len * 2, + output_dim=self.hidden_dim) + self.softmax_layer = Linear( + input_dim=self.hidden_dim, output_dim=2, act='softmax') + + def forward(self, left, right): + left = fluid.layers.reshape( + left, shape=[-1, self.seq_len, self.hidden_dim]) + right = fluid.layers.reshape( + right, shape=[-1, self.seq_len, self.hidden_dim]) + left_cnn = self.encoder_layer(left) + right_cnn = self.encoder_layer(right) + concat = fluid.layers.concat([left_cnn, right_cnn], axis=1) + concat_fc = self.fc_layer_po(concat) + pred = self.softmax_layer(concat_fc) + return pred diff --git a/examples/similarity_net/nets/losses/hinge_loss.py b/examples/similarity_net/nets/losses/hinge_loss.py new file mode 100644 index 0000000..6081f8f --- /dev/null +++ b/examples/similarity_net/nets/losses/hinge_loss.py @@ -0,0 +1,39 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +hinge loss +""" + +import sys +sys.path.append("../") +import paddle.fluid as fluid +from paddle.incubate.hapi.model import Loss + + +class HingeLoss(Loss): + def __init__(self, conf_dict): + super(HingeLoss, self).__init__() + self.margin = conf_dict["loss"]["margin"] + + def forward(self, outputs, labels=None): + pos, neg = outputs + loss = fluid.layers.fill_constant_batch_size_like(neg, neg.shape, + "float32", 0.0) + loss_margin = fluid.layers.fill_constant_batch_size_like( + neg, neg.shape, "float32", self.margin) + sub = fluid.layers.elementwise_sub(neg, pos) + add = fluid.layers.elementwise_add(sub, loss_margin) + loss_max = fluid.layers.elementwise_max(loss, add) + loss_last = fluid.layers.reduce_mean(loss_max) + return loss_last diff --git a/examples/similarity_net/nets/losses/log_loss.py b/examples/similarity_net/nets/losses/log_loss.py new file mode 100644 index 0000000..a11d1e4 --- /dev/null +++ b/examples/similarity_net/nets/losses/log_loss.py @@ -0,0 +1,32 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
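+# For a batch of pairwise (pos, neg) scores this loss is
+#     L = mean(sigmoid(neg - pos))
+# which decreases as positive pairs are scored above negative ones.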
+""" +log loss +""" + +import sys +sys.path.append("../") +import paddle.fluid as fluid +from paddle.incubate.hapi.model import Loss + + +class LogLoss(Loss): + def __init__(self, conf_dict): + super(LogLoss, self).__init__() + + def forward(self, outputs, labels=None): + pos, neg = outputs + loss = fluid.layers.sigmoid(neg - pos) + avg_loss = fluid.layers.reduce_mean(loss) + return loss diff --git a/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py b/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py new file mode 100644 index 0000000..ec19c5a --- /dev/null +++ b/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py @@ -0,0 +1,31 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +softmax loss +""" + +import sys +sys.path.append("../") +import paddle.fluid as fluid +from paddle.incubate.hapi.model import Loss + + +class SoftmxCrossEntropyLoss(Loss): + def __init__(self, conf_dict): + super(SoftmxCrossEntropyLoss, self).__init__() + + def forward(self, input, label): + cost = fluid.layers.cross_entropy(input=input, label=label) + avg_cost = fluid.layers.reduce_mean(cost) + return avg_cost diff --git a/examples/similarity_net/reader.py b/examples/similarity_net/reader.py new file mode 100644 index 0000000..d61a83c --- /dev/null +++ b/examples/similarity_net/reader.py @@ -0,0 +1,280 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +SimNet reader +""" + +import logging +import numpy as np +import io + + +class SimNetProcessor(object): + def __init__(self, args, vocab): + self.args = args + # load vocab + self.vocab = vocab + self.valid_label = np.array([]) + self.test_label = np.array([]) + + self.seq_len = args.seq_len + + def padding_text(self, x): + if len(x) < self.seq_len: + x += [0] * (self.seq_len - len(x)) + if len(x) > self.seq_len: + x = x[0:self.seq_len] + return x + + def get_reader(self, mode, epoch=0): + """ + Get Reader + """ + + def reader_with_pairwise(): + """ + Reader with Pairwise + """ + if mode == "valid": + with io.open( + self.args.valid_data_dir, "r", + encoding="utf8") as file: + for line in file: + query, title, label = line.strip().split("\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + title = [ + self.vocab[word] for word in title.split(" ") + if word in self.vocab + ] + + label = [1 if int(label) == 1 else 0] + if len(query) == 0: + query = [0] + if len(title) == 0: + title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + label = self.padding_text(label) + + yield [query, title, label] + elif mode == "test": + with io.open( + self.args.test_data_dir, "r", encoding="utf8") as file: + for line in file: + query, title, label = line.strip().split("\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + title = [ + self.vocab[word] for word in title.split(" ") + if word in self.vocab + ] + + label = [1 if int(label) == 1 else 0] + if len(query) == 0: + query = [0] + if len(title) == 0: + title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + label = self.padding_text(label) + + yield [query, title, label] + else: + for idx in range(epoch): + with io.open( + self.args.train_data_dir, "r", + encoding="utf8") as file: + for line in file: + query, pos_title, neg_title = line.strip().split( + "\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + pos_title = [ + self.vocab[word] + for word in pos_title.split(" ") + if word in self.vocab + ] + neg_title = [ + self.vocab[word] + for word in neg_title.split(" ") + if word in self.vocab + ] + if len(query) == 0: + query = [0] + if len(pos_title) == 0: + pos_title = [0] + if len(neg_title) == 0: + neg_title = [0] + + query = self.padding_text(query) + pos_title = self.padding_text(pos_title) + neg_title = self.padding_text(neg_title) + + yield [query, pos_title, neg_title] + + def reader_with_pointwise(): + """ + Reader with Pointwise + """ + if mode == "valid": + with io.open( + self.args.valid_data_dir, "r", + encoding="utf8") as file: + for line in file: + query, title, label = line.strip().split("\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + title = [ + self.vocab[word] for word in title.split(" ") + if word in self.vocab + ] + if len(query) == 0: + query = [0] + if len(title) == 0: + title = [0] + if len(label) == 0: + label = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + label = int(label) + + yield [query, title, label] + elif mode == "test": + with io.open( + self.args.test_data_dir, "r", encoding="utf8") as file: + for line in file: + query, title, label = line.strip().split("\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + title = [ + self.vocab[word] for word in title.split(" ") + if word in self.vocab + ] + if len(query) == 
0: + query = [0] + if len(title) == 0: + title = [0] + if len(label) == 0: + label = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + lebel = int(label) + + yield [query, title, label] + else: + for idx in range(epoch): + with io.open( + self.args.train_data_dir, "r", + encoding="utf8") as file: + for line in file: + query, title, label = line.strip().split("\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + title = [ + self.vocab[word] for word in title.split(" ") + if word in self.vocab + ] + + if len(query) == 0: + query = [0] + if len(title) == 0: + title = [0] + if len(label) == 0: + label = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + label = int(label) + + yield [query, title, label] + + if self.args.task_mode == "pairwise": + return reader_with_pairwise + else: + return reader_with_pointwise + + def get_infer_reader(self): + """ + get infer reader + """ + with io.open(self.args.infer_data_dir, "r", encoding="utf8") as file: + for line in file: + query, title = line.strip().split("\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + title = [ + self.vocab[word] for word in title.split(" ") + if word in self.vocab + ] + if len(query) == 0: + query = [0] + if len(title) == 0: + title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + + yield [query, title] + + def get_infer_pairdata(self): + """ + get infer data + """ + with io.open(self.args.infer_data_dir, "r", encoding="utf8") as file: + for line in file: + query, title = line.strip().split("\t") + yield line.strip() + + def get_valid_label(self): + """ + get valid data label + """ + if self.valid_label.size == 0: + labels = [] + with io.open(self.args.valid_data_dir, "r", encoding="utf8") as f: + for line in f: + labels.append([int(line.strip().split("\t")[-1])]) + self.valid_label = np.array(labels) + return self.valid_label + + def get_test_label(self): + """ + get test data label + """ + if self.test_label.size == 0: + labels = [] + with io.open(self.args.test_data_dir, "r", encoding="utf8") as f: + for line in f: + labels.append([int(line.strip().split("\t")[-1])]) + self.test_label = np.array(labels) + return self.test_label diff --git a/examples/similarity_net/run.sh b/examples/similarity_net/run.sh new file mode 100644 index 0000000..69c4486 --- /dev/null +++ b/examples/similarity_net/run.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +export FLAGS_enable_parallel_graph=1 +export FLAGS_sync_nccl_allreduce=1 +export CUDA_VISIBLE_DEVICES=3 +export FLAGS_fraction_of_gpu_memory_to_use=0.95 +TASK_NAME='simnet' +TRAIN_DATA_PATH=./data/train_pairwise_data +VALID_DATA_PATH=./data/test_pairwise_data +TEST_DATA_PATH=./data/test_pairwise_data +INFER_DATA_PATH=./data/infer_data +VOCAB_PATH=./data/term2id.dict +CKPT_PATH=./model_files +TEST_RESULT_PATH=./test_result +INFER_RESULT_PATH=./infer_result +TASK_MODE='pairwise' +CONFIG_PATH=./config/bow_pairwise.json + +INIT_CHECKPOINT=./model_files/bow_pairwise/200 + + + +# run_train +train() { + python run_classifier.py \ + --task_name ${TASK_NAME} \ + --use_cuda False \ + --do_train True \ + --do_valid True \ + --do_infer False \ + --batch_size 128 \ + --train_data_dir ${TRAIN_DATA_PATH} \ + --valid_data_dir ${VALID_DATA_PATH} \ + --test_data_dir ${TEST_DATA_PATH} \ + --infer_data_dir ${INFER_DATA_PATH} \ + --output_dir ${CKPT_PATH} \ + --config_path ${CONFIG_PATH} \ + --vocab_path ${VOCAB_PATH} \ + --epoch 40 \ + 
--save_steps 2000 \ + --validation_steps 200 \ + --compute_accuracy False \ + --lamda 0.958 \ + --task_mode ${TASK_MODE}\ + --init_checkpoint "" +} +#run_evaluate +evaluate() { + python run_classifier.py \ + --task_name ${TASK_NAME} \ + --use_cuda false \ + --do_test True \ + --verbose_result True \ + --batch_size 128 \ + --test_data_dir ${TEST_DATA_PATH} \ + --test_result_path ${TEST_RESULT_PATH} \ + --config_path ${CONFIG_PATH} \ + --vocab_path ${VOCAB_PATH} \ + --task_mode ${TASK_MODE} \ + --compute_accuracy False \ + --lamda 0.958 \ + --init_checkpoint ${INIT_CHECKPOINT} +} +# run_infer +infer() { + python run_classifier.py \ + --task_name ${TASK_NAME} \ + --use_cuda false \ + --do_infer True \ + --batch_size 128 \ + --infer_data_dir ${INFER_DATA_PATH} \ + --infer_result_path ${INFER_RESULT_PATH} \ + --config_path ${CONFIG_PATH} \ + --vocab_path ${VOCAB_PATH} \ + --task_mode ${TASK_MODE} \ + --init_checkpoint ${INIT_CHECKPOINT} +} + +main() { + local cmd=${1:-help} + case "${cmd}" in + train) + train "$@"; + ;; + eval) + evaluate "$@"; + ;; + infer) + infer "$@"; + ;; + help) + echo "Usage: ${BASH_SOURCE} {train|eval|infer}"; + return 0; + ;; + *) + echo "Unsupport commend [${cmd}]"; + echo "Usage: ${BASH_SOURCE} {train|eval|infer}"; + return 1; + ;; + esac +} +main "$@" \ No newline at end of file diff --git a/examples/similarity_net/run_classifier.py b/examples/similarity_net/run_classifier.py new file mode 100644 index 0000000..e5f120b --- /dev/null +++ b/examples/similarity_net/run_classifier.py @@ -0,0 +1,426 @@ +# -*- encoding: utf-8 -*- +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
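+# Control flow: config.py resolves the network and loss classes named in the
+# JSON config, reader.py yields padded id sequences, and one of the
+# train()/test()/infer() functions below runs according to the --do_* flags.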
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import six +import io +import warnings +import argparse +import multiprocessing + +import paddle +import paddle.fluid as fluid +from paddle.fluid.io import DataLoader +from functools import partial, reduce +import numpy as np +import reader +import config +from utils import load_vocab, import_class, get_accuracy, ArgConfig, print_arguments + +from paddle.incubate.hapi.metrics import Accuracy +from paddle.incubate.hapi.model import set_device, Model, Input, Loss, CrossEntropy + + +def train(conf_dict, args): + device = set_device("cpu") + fluid.enable_dygraph(device) + + # load auc method + metric = fluid.metrics.Auc(name="auc") + + def valid_and_test(pred_list, process, mode): + """ + return auc and acc + """ + pred_list = np.vstack(pred_list) + if mode == "test": + label_list = process.get_test_label() + elif mode == "valid": + label_list = process.get_valid_label() + if args.task_mode == "pairwise": + pred_list = (pred_list + 1) / 2 + pred_list = np.hstack( + (np.ones_like(pred_list) - pred_list, pred_list)) + metric.reset() + metric.update(pred_list, label_list) + auc = metric.eval() + if args.compute_accuracy: + acc = get_accuracy(pred_list, label_list, args.task_mode, + args.lamda) + return auc, acc + else: + return auc + + # loading vocabulary + vocab = load_vocab(args.vocab_path) + # get vocab size + conf_dict['dict_size'] = len(vocab) + conf_dict['seq_len'] = args.seq_len + # Load network structure dynamically + model = import_class("./nets", conf_dict["net"]["module_name"], + conf_dict["net"]["class_name"])(conf_dict) + loss = import_class("./nets/losses", conf_dict["loss"]["module_name"], + conf_dict["loss"]["class_name"])(conf_dict) + # Load Optimization method + learning_rate = conf_dict["optimizer"]["learning_rate"] + optimizer_name = conf_dict["optimizer"]["class_name"] + if optimizer_name == 'SGDOptimizer': + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate, parameter_list=model.parameters()) + elif optimizer_name == 'AdamOptimizer': + beta1 = conf_dict["optimizer"]["beta1"] + beta2 = conf_dict["optimizer"]["beta2"] + epsilon = conf_dict["optimizer"]["epsilon"] + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + parameter_list=model.parameters()) + + global_step = 0 + valid_step = 0 + losses = [] + + # define dataloader + simnet_process = reader.SimNetProcessor(args, vocab) + train_pyreader = DataLoader.from_generator( + capacity=16, return_list=True, use_double_buffer=True) + get_train_examples = simnet_process.get_reader("train", epoch=args.epoch) + train_pyreader.set_sample_list_generator( + fluid.io.batch( + get_train_examples, batch_size=args.batch_size), + places=device) + if args.do_valid: + valid_pyreader = DataLoader.from_generator( + capacity=16, return_list=True, use_double_buffer=True) + get_valid_examples = simnet_process.get_reader("valid") + valid_pyreader.set_sample_list_generator( + fluid.io.batch( + get_valid_examples, batch_size=args.batch_size), + places=device) + pred_list = [] + + if args.task_mode == "pairwise": + inputs = [ + Input( + [None, 1], 'int64', name='input_left'), Input( + [None, 1], 'int64', name='pos_right'), Input( + [None, 1], 'int64', name='neg_right') + ] + + model.prepare( + inputs=inputs, + optimizer=optimizer, + loss_function=loss, + device=device) + + for left, pos_right, neg_right in train_pyreader(): + input_left = 
fluid.layers.reshape(left, shape=[-1, 1]) + pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + neg_right = fluid.layers.reshape(neg_right, shape=[-1, 1]) + + final_loss = model.train_batch([input_left, pos_right, neg_right]) + print("train_steps: %d, train_loss: %f" % + (global_step, final_loss[0][0])) + losses.append(np.mean(final_loss)) + global_step += 1 + + if args.do_valid and global_step % args.validation_steps == 0: + for left, pos_right, neg_right in valid_pyreader(): + input_left = fluid.layers.reshape(left, shape=[-1, 1]) + pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + neg_right = fluid.layers.reshape(neg_right, shape=[-1, 1]) + + result, _ = model.test_batch( + [input_left, pos_right, neg_right]) + pred_list += list(result) + valid_step += 1 + + valid_result = valid_and_test(pred_list, simnet_process, + "valid") + if args.compute_accuracy: + valid_auc, valid_acc = valid_result + print( + "valid_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" + % (global_step, valid_auc, valid_acc, np.mean(losses))) + else: + valid_auc = valid_result + print("valid_steps: %d, valid_auc: %f, valid_loss: %f" % + (global_step, valid_auc, np.mean(losses))) + + if global_step % args.save_steps == 0: + model_save_dir = os.path.join(args.output_dir, + conf_dict["model_path"]) + model_path = os.path.join(model_save_dir, str(global_step)) + + if not os.path.exists(model_save_dir): + os.makedirs(model_save_dir) + model.save(model_path) + + else: + inputs = [ + Input( + [None, 1], 'int64', name='left'), Input( + [None, 1], 'int64', name='right') + ] + label = [Input([None, 1], 'int64', name='neg_right')] + + model.prepare( + inputs=inputs, + optimizer=optimizer, + loss_function=loss, + device=device) + + for left, right, label in train_pyreader(): + left = fluid.layers.reshape(left, shape=[-1, 1]) + right = fluid.layers.reshape(right, shape=[-1, 1]) + label = fluid.layers.reshape(label, shape=[-1, 1]) + + final_loss = model.train_batch([left, right], [label]) + print("train_steps: %d, train_loss: %f" % + (global_step, final_loss[0][0])) + losses.append(np.mean(final_loss)) + global_step += 1 + + if args.do_valid and global_step % args.validation_steps == 0: + for left, right, label in valid_pyreader(): + valid_left = fluid.layers.reshape(left, shape=[-1, 1]) + valid_right = fluid.layers.reshape(right, shape=[-1, 1]) + valid_label = fluid.layers.reshape(label, shape=[-1, 1]) + + result, _ = model.test_batch( + [valid_left, valid_right, valid_right]) + pred_list += list(result) + valid_step += 1 + + valid_result = valid_and_test(pred_list, simnet_process, + "valid") + if args.compute_accuracy: + valid_auc, valid_acc = valid_result + print( + "valid_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" + % (global_step, valid_auc, valid_acc, np.mean(losses))) + else: + valid_auc = valid_result + print("valid_steps: %d, valid_auc: %f, valid_loss: %f" % + (global_step, valid_auc, np.mean(losses))) + + if global_step % args.save_steps == 0: + model_save_dir = os.path.join(args.output_dir, + conf_dict["model_path"]) + model_path = os.path.join(model_save_dir, str(global_step)) + + if not os.path.exists(model_save_dir): + os.makedirs(model_save_dir) + model.save(model_path) + + +def test(conf_dict, args): + device = set_device("cpu") + fluid.enable_dygraph(device) + + metric = fluid.metrics.Auc(name="auc") + + def valid_and_test(pred_list, process, mode): + """ + return auc and acc + """ + pred_list = np.vstack(pred_list) + if mode == "test": + label_list = 
process.get_test_label()
+        elif mode == "valid":
+            label_list = process.get_valid_label()
+        if args.task_mode == "pairwise":
+            pred_list = (pred_list + 1) / 2
+            pred_list = np.hstack(
+                (np.ones_like(pred_list) - pred_list, pred_list))
+        metric.reset()
+        metric.update(pred_list, label_list)
+        auc = metric.eval()
+        if args.compute_accuracy:
+            acc = get_accuracy(pred_list, label_list, args.task_mode,
+                               args.lamda)
+            return auc, acc
+        else:
+            return auc
+
+    # loading vocabulary
+    vocab = load_vocab(args.vocab_path)
+    # get vocab size
+    conf_dict['dict_size'] = len(vocab)
+    conf_dict['seq_len'] = args.seq_len
+    # Load network structure dynamically
+    model = import_class("./nets", conf_dict["net"]["module_name"],
+                         conf_dict["net"]["class_name"])(conf_dict)
+    model.load(args.init_checkpoint)
+
+    simnet_process = reader.SimNetProcessor(args, vocab)
+    test_pyreader = DataLoader.from_generator(
+        capacity=16, return_list=True, use_double_buffer=True)
+    get_test_examples = simnet_process.get_reader("test")
+    test_pyreader.set_sample_list_generator(
+        fluid.io.batch(
+            get_test_examples, batch_size=args.batch_size),
+        places=device)
+
+    pred_list = []
+    test_step = 0
+
+    if args.task_mode == "pairwise":
+        inputs = [
+            Input(
+                [None, 1], 'int64', name='input_left'), Input(
+                    [None, 1], 'int64', name='pos_right'), Input(
+                        [None, 1], 'int64', name='neg_right')
+        ]
+
+        model.prepare(inputs=inputs, device=device)
+
+        for left, pos_right, label in test_pyreader():
+            input_left = fluid.layers.reshape(left, shape=[-1, 1])
+            pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
+            # test data carries labels rather than negative titles; feed the
+            # title twice so the pairwise network gets its three inputs
+            neg_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
+
+            final_pred, _ = model.test_batch(
+                [input_left, pos_right, neg_right])
+            pred_list += list(final_pred)
+            test_step += 1
+
+        test_result = valid_and_test(pred_list, simnet_process, "test")
+        if args.compute_accuracy:
+            test_auc, test_acc = test_result
+            print("test_steps: %d, test_auc: %f, test_acc: %f" %
+                  (test_step, test_auc, test_acc))
+        else:
+            test_auc = test_result
+            print("test_steps: %d, test_auc: %f" % (test_step, test_auc))
+
+    else:
+        inputs = [
+            Input(
+                [None, 1], 'int64', name='left'), Input(
+                    [None, 1], 'int64', name='right')
+        ]
+
+        model.prepare(inputs=inputs, device=device)
+
+        for left, right, label in test_pyreader():
+            left = fluid.layers.reshape(left, shape=[-1, 1])
+            right = fluid.layers.reshape(right, shape=[-1, 1])
+            label = fluid.layers.reshape(label, shape=[-1, 1])
+
+            final_pred = model.test_batch([left, right])
+            pred_list += list(final_pred)
+            test_step += 1
+
+        test_result = valid_and_test(pred_list, simnet_process, "test")
+        if args.compute_accuracy:
+            test_auc, test_acc = test_result
+            print("test_steps: %d, test_auc: %f, test_acc: %f" %
+                  (test_step, test_auc, test_acc))
+        else:
+            test_auc = test_result
+            print("test_steps: %d, test_auc: %f" % (test_step, test_auc))
+
+
+def infer(conf_dict, args):
+    device = set_device("cpu")
+    fluid.enable_dygraph(device)
+
+    # loading vocabulary
+    vocab = load_vocab(args.vocab_path)
+    # get vocab size
+    conf_dict['dict_size'] = len(vocab)
+    conf_dict['seq_len'] = args.seq_len
+    # Load network structure dynamically
+    model = import_class("./nets", conf_dict["net"]["module_name"],
+                         conf_dict["net"]["class_name"])(conf_dict)
+    model.load(args.init_checkpoint)
+
+    simnet_process = reader.SimNetProcessor(args, vocab)
+    get_infer_examples = simnet_process.get_infer_reader
+    infer_pyreader = DataLoader.from_generator(
+        capacity=16, return_list=True, use_double_buffer=True)
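+    # Bind the batched sample generator to the DataLoader; places=device
+    # keeps the prepared batches on the device used for inference.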
+    infer_pyreader.set_sample_list_generator(
+        fluid.io.batch(
+            get_infer_examples, batch_size=args.batch_size),
+        places=device)
+    pred_list = []
+
+    if args.task_mode == "pairwise":
+        inputs = [
+            Input(
+                [None, 1], 'int64', name='input_left'), Input(
+                    [None, 1], 'int64', name='pos_right'), Input(
+                        [None, 1], 'int64', name='neg_right')
+        ]
+
+        model.prepare(inputs=inputs, device=device)
+
+        for left, pos_right in infer_pyreader():
+            input_left = fluid.layers.reshape(left, shape=[-1, 1])
+            pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
+            # infer data has no negative column; feed the title twice so the
+            # pairwise network still receives its three expected inputs
+            neg_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
+
+            final_pred, _ = model.test_batch(
+                [input_left, pos_right, neg_right])
+            # map cosine similarity from [-1, 1] to a [0, 1] score
+            pred_list += list(
+                map(lambda item: str((item[0] + 1) / 2), final_pred))
+
+    else:
+        inputs = [
+            Input(
+                [None, 1], 'int64', name='left'), Input(
+                    [None, 1], 'int64', name='right')
+        ]
+
+        model.prepare(inputs=inputs, device=device)
+
+        for left, right in infer_pyreader():
+            left = fluid.layers.reshape(left, shape=[-1, 1])
+            right = fluid.layers.reshape(right, shape=[-1, 1])
+
+            final_pred = model.test_batch([left, right])
+            # keep the softmax probability of the "similar" class
+            pred_list += list(
+                map(lambda item: str(item[1]), final_pred))
+
+    with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file:
+        for _data, _pred in zip(simnet_process.get_infer_pairdata(),
+                                pred_list):
+            infer_file.write(_data + "\t" + _pred + "\n")
+
+
+if __name__ == '__main__':
+    args = ArgConfig()
+    args = args.build_conf()
+    print_arguments(args)
+    conf_dict = config.SimNetConfig(args)
+
+    if args.do_train:
+        train(conf_dict, args)
+    elif args.do_test:
+        test(conf_dict, args)
+    elif args.do_infer:
+        infer(conf_dict, args)
+    else:
+        raise ValueError("one of do_train, do_test and do_infer must be True")
diff --git a/examples/similarity_net/utils.py b/examples/similarity_net/utils.py
new file mode 100644
index 0000000..e62b999
--- /dev/null
+++ b/examples/similarity_net/utils.py
@@ -0,0 +1,244 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+SimNet utilities.
+""" +import argparse +import time +import sys +import re +import os +import six +import numpy as np +import paddle.fluid as fluid +import io +import pickle +import warnings +from functools import partial +from hapi.configure import ArgumentGroup, str2bool +""" +******functions for file processing****** +""" + + +def load_vocab(file_path): + """ + load the given vocabulary + """ + vocab = {} + f = io.open(file_path, "r", encoding="utf8") + for line in f: + items = line.strip("\n").split("\t") + if items[0] not in vocab: + vocab[items[0]] = int(items[1]) + vocab[""] = 0 + return vocab + + +def get_result_file(args): + """ + Get Result File + Args: + conf_dict: Input path config + samples_file_path: Data path of real training + predictions_file_path: Prediction results path + Returns: + result_file: merge sample and predict result + + """ + with io.open(args.test_data_dir, "r", encoding="utf8") as test_file: + with io.open( + "predictions.txt", "r", encoding="utf8") as predictions_file: + with io.open( + args.test_result_path, "w", + encoding="utf8") as test_result_file: + test_datas = [line.strip("\n") for line in test_file] + predictions = [line.strip("\n") for line in predictions_file] + for test_data, prediction in zip(test_datas, predictions): + test_result_file.write(test_data + "\t" + prediction + + "\n") + os.remove("predictions.txt") + + +def import_class(module_path, module_name, class_name): + """ + Load class dynamically + Args: + module_path: The current path of the module + module_name: The module name + class_name: The name of class in the import module + Return: + Return the attribute value of the class object + """ + if module_path: + sys.path.append(module_path) + module = __import__(module_name) + return getattr(module, class_name) + + +""" +******functions for string processing****** +""" + + +def pattern_match(pattern, line): + """ + Check whether a string is matched + Args: + pattern: mathing pattern + line : input string + Returns: + True/False + """ + if re.match(pattern, line): + return True + else: + return False + + +""" +******functions for parameter processing****** +""" + + +def print_progress(task_name, percentage, style=0): + """ + Print progress bar + Args: + task_name: The name of the current task + percentage: Current progress + style: Progress bar form + """ + styles = ['#', '█'] + mark = styles[style] * percentage + mark += ' ' * (100 - percentage) + status = '%d%%' % percentage if percentage < 100 else 'Finished' + sys.stdout.write('%+20s [%s] %s\r' % (task_name, mark, status)) + sys.stdout.flush() + time.sleep(0.002) + + +class ArgConfig(object): + def __init__(self): + parser = argparse.ArgumentParser() + + model_g = ArgumentGroup(parser, "model", + "model configuration and paths.") + model_g.add_arg("config_path", str, None, + "Path to the json file for EmoTect model config.") + model_g.add_arg("init_checkpoint", str, None, + "Init checkpoint to resume training from.") + model_g.add_arg("output_dir", str, None, + "Directory path to save checkpoints") + model_g.add_arg("task_mode", str, None, + "task mode: pairwise or pointwise") + + train_g = ArgumentGroup(parser, "training", "training options.") + train_g.add_arg("epoch", int, 10, "Number of epoches for training.") + train_g.add_arg("save_steps", int, 20, + "The steps interval to save checkpoints.") + train_g.add_arg("validation_steps", int, 100, + "The steps interval to evaluate model performance.") + + infer_g = ArgumentGroup(parser, "inferring", "inferring related") + 
infer_g.add_arg("test_result_path", str, "test_result", + "Directory path to test result.") + infer_g.add_arg("infer_result_path", str, "infer_result.txt", + "Directory path to infer result.") + + data_g = ArgumentGroup( + parser, "data", + "Data paths, vocab paths and data processing options") + data_g.add_arg("train_data_dir", str, None, + "Directory path to training data.") + data_g.add_arg("valid_data_dir", str, None, + "Directory path to valid data.") + data_g.add_arg("test_data_dir", str, None, + "Directory path to testing data.") + data_g.add_arg("infer_data_dir", str, None, + "Directory path to infer data.") + data_g.add_arg("vocab_path", str, None, "Vocabulary path.") + data_g.add_arg("batch_size", int, 32, + "Total examples' number in batch for training.") + data_g.add_arg("seq_len", int, 32, "The length of each sentence.") + + run_type_g = ArgumentGroup(parser, "run_type", "running type options.") + run_type_g.add_arg("use_cuda", bool, False, + "If set, use GPU for training.") + run_type_g.add_arg( + "task_name", str, None, + "The name of task to perform sentiment classification.") + run_type_g.add_arg("do_train", bool, False, + "Whether to perform training.") + run_type_g.add_arg("do_valid", bool, False, "Whether to perform dev.") + #run_type_g.add_arg("do_test", bool, False, "Whether to perform testing.") + run_type_g.add_arg("do_infer", bool, False, + "Whether to perform inference.") + run_type_g.add_arg("compute_accuracy", bool, False, + "Whether to compute accuracy.") + run_type_g.add_arg( + "lamda", float, 0.91, + "When task_mode is pairwise, lamda is the threshold for calculating the accuracy." + ) + + custom_g = ArgumentGroup(parser, "customize", "customized options.") + self.custom_g = custom_g + + #parser.add_argument('--enable_ce',action='store_true',help='If set, run the task with continuous evaluation logs.') + + self.parser = parser + + def add_arg(self, name, dtype, default, descrip): + self.custom_g.add_arg(name, dtype, default, descrip) + + def build_conf(self): + return self.parser.parse_args() + + +def print_arguments(args): + """ + Print Arguments + """ + print('----------- Configuration Arguments -----------') + for arg, value in sorted(six.iteritems(vars(args))): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def get_softmax(preds): + """ + compute sotfmax + """ + _exp = np.exp(preds) + return _exp / np.sum(_exp, axis=1, keepdims=True) + + +def get_sigmoid(preds): + """ + compute sigmoid + """ + return 1 / (1 + np.exp(-preds)) + + +def get_accuracy(preds, labels, mode, lamda=0.958): + """ + compute accuracy + """ + if mode == "pairwise": + preds = np.array(list(map(lambda x: 1 if x[1] >= lamda else 0, preds))) + else: + preds = np.array(list(map(lambda x: np.argmax(x), preds))) + labels = np.squeeze(labels) + return np.mean(preds[0:len(labels)] == labels) From c138701d7aef7c31047881d9eac4c66ef657f424 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 3 Jun 2020 04:17:12 +0000 Subject: [PATCH 6/7] add hapi/simnet_model --- examples/similarity_net/README.md | 14 +- examples/similarity_net/nets/bow.py | 28 +-- examples/similarity_net/nets/cnn.py | 63 +++--- examples/similarity_net/nets/gru.py | 103 ++++++++++ .../similarity_net/nets/losses/hinge_loss.py | 6 +- .../similarity_net/nets/losses/log_loss.py | 2 +- .../nets/losses/softmax_cross_entropy_loss.py | 26 ++- examples/similarity_net/nets/lstm.py | 119 ++++++++++++ examples/similarity_net/reader.py | 4 +- examples/similarity_net/run.sh | 13 +- 
examples/similarity_net/run_classifier.py | 183 ++++++++---------- examples/similarity_net/utils.py | 5 +- 12 files changed, 376 insertions(+), 190 deletions(-) create mode 100644 examples/similarity_net/nets/gru.py create mode 100644 examples/similarity_net/nets/lstm.py diff --git a/examples/similarity_net/README.md b/examples/similarity_net/README.md index 4f7270b..a1b64a4 100644 --- a/examples/similarity_net/README.md +++ b/examples/similarity_net/README.md @@ -3,16 +3,6 @@ ### 任务说明 短文本语义匹配(SimilarityNet, SimNet)是一个计算短文本相似度的框架,可以根据用户输入的两个文本,计算出相似度得分。SimNet框架在百度各产品上广泛应用,主要包括BOW、CNN、RNN、MMDNN等核心网络结构形式,提供语义相似度计算训练和预测框架,适用于信息检索、新闻推荐、智能客服等多个应用场景,帮助企业解决语义匹配问题。 -### 效果说明 -基于百度海量搜索数据,我们训练了一个SimNet-BOW-Pairwise语义匹配模型,在一些真实的FAQ问答场景中,该模型效果比基于字面的相似度方法AUC提升5%以上,我们基于百度自建测试集(包含聊天、客服等数据集)和进行评测,效果如下表所示。 - - -| 模型 | 百度知道 | ECOM |QQSIM | UNICOM | -|:-----------:|:-------------:|:-------------:|:-------------:|:-------------:| -| | AUC | AUC | AUC|正逆序比| -|BOW_Pairwise|0.6815|0.7331|0.7638|1.5565| - - #### 测试集说明 | 数据集 | 来源 | 垂类 | |:-----------:|:-------------:|:-------------:| @@ -29,9 +19,9 @@ #### 安装代码 克隆工具集代码库到本地 ```shell -git clone https://github.com/PaddlePaddle/models.git +git clone https://github.com/PaddlePaddle/hapi.git -cd models/dygraph/similarity_net +cd hapi/examples/similarity_net ``` #### 数据准备 下载经过预处理的数据,运行命令后,data目录下会存在训练集数据示例、测试集数据示例,以及对应词索引字典(term2id.dict)。 diff --git a/examples/similarity_net/nets/bow.py b/examples/similarity_net/nets/bow.py index bbe9b14..ee4fe49 100644 --- a/examples/similarity_net/nets/bow.py +++ b/examples/similarity_net/nets/bow.py @@ -14,6 +14,7 @@ """ bow class """ +import numpy as np import paddle.fluid as fluid from paddle.fluid.dygraph import Linear, Layer, Embedding from paddle.incubate.hapi.model import Model @@ -25,11 +26,10 @@ class BOWEncoder(Layer): simple BOWEncoder for simnet """ - def __init__(self, dict_size, bow_dim, seq_len, emb_dim, padding_idx): + def __init__(self, dict_size, bow_dim, emb_dim, padding_idx): super(BOWEncoder, self).__init__() self.dict_size = dict_size self.bow_dim = bow_dim - self.seq_len = seq_len self.emb_dim = emb_dim self.padding_idx = padding_idx self.emb_layer = Embedding( @@ -41,28 +41,20 @@ def __init__(self, dict_size, bow_dim, seq_len, emb_dim, padding_idx): def forward(self, input): emb = self.emb_layer(input) - emb_reshape = fluid.layers.reshape( - emb, shape=[-1, self.seq_len, self.bow_dim]) - bow_emb = fluid.layers.reduce_sum(emb_reshape, dim=1) + bow_emb = fluid.layers.reduce_sum(emb, dim=1) return bow_emb class Pair_BOWModel(Model): - """ - classify model - """ - def __init__(self, conf_dict): super(Pair_BOWModel, self).__init__() self.dict_size = conf_dict["dict_size"] - self.task_mode = conf_dict["task_mode"] self.emb_dim = conf_dict["net"]["emb_dim"] self.bow_dim = conf_dict["net"]["bow_dim"] - self.seq_len = conf_dict["seq_len"] self.padding_idx = None - self.emb_layer = BOWEncoder(self.dict_size, self.bow_dim, self.seq_len, - self.emb_dim, self.padding_idx) + self.emb_layer = BOWEncoder(self.dict_size, self.bow_dim, self.emb_dim, + self.padding_idx) self.bow_layer = Linear( input_dim=self.bow_dim, output_dim=self.bow_dim) @@ -83,21 +75,15 @@ def forward(self, left, pos_right, neg_right): class Point_BOWModel(Model): - """ - classify model - """ - def __init__(self, conf_dict): super(Point_BOWModel, self).__init__() self.dict_size = conf_dict["dict_size"] - self.task_mode = conf_dict["task_mode"] self.emb_dim = conf_dict["net"]["emb_dim"] self.bow_dim = conf_dict["net"]["bow_dim"] - self.seq_len = 
conf_dict["seq_len"] self.padding_idx = None - self.emb_layer = BOWEncoder(self.dict_size, self.bow_dim, self.seq_len, - self.emb_dim, self.padding_idx) + self.emb_layer = BOWEncoder(self.dict_size, self.bow_dim, self.emb_dim, + self.padding_idx) self.bow_layer_po = Linear( input_dim=self.bow_dim * 2, output_dim=self.bow_dim) self.softmax_layer = Linear( diff --git a/examples/similarity_net/nets/cnn.py b/examples/similarity_net/nets/cnn.py index 97dbb2f..89769f6 100644 --- a/examples/similarity_net/nets/cnn.py +++ b/examples/similarity_net/nets/cnn.py @@ -15,27 +15,27 @@ cnn class """ import paddle.fluid as fluid -from paddle.fluid.dygraph import Linear, Layer, Conv2D, Pool2D +from paddle.fluid.dygraph import Linear, Layer, Conv2D, Pool2D, Embedding from paddle.incubate.hapi.model import Model from paddle.incubate.hapi.text.text import CNNEncoder class Pair_CNNModel(Model): - """ - classify model - """ - def __init__(self, conf_dict): super(Pair_CNNModel, self).__init__() self.dict_size = conf_dict["dict_size"] - self.task_mode = conf_dict["task_mode"] self.emb_dim = conf_dict["net"]["emb_dim"] self.filter_size = conf_dict["net"]["filter_size"] self.num_filters = conf_dict["net"]["num_filters"] self.hidden_dim = conf_dict["net"]["hidden_dim"] - self.seq_len = conf_dict["seq_len"] self.padding_idx = None - #layers + + self.emb_layer = Embedding( + size=[self.dict_size, self.emb_dim], + is_sparse=True, + padding_idx=self.padding_idx, + param_attr=fluid.ParamAttr( + name='emb', initializer=fluid.initializer.Xavier())) self.encoder_layer = CNNEncoder( num_channels=1, num_filters=self.num_filters, @@ -44,24 +44,18 @@ def __init__(self, conf_dict): layer_num=1, act='relu') self.fc_layer = Linear( - input_dim=self.num_filters * self.seq_len, - output_dim=self.hidden_dim) - self.fc_layer_po = Linear( - input_dim=self.num_filters * self.seq_len * 2, - output_dim=self.hidden_dim) - self.softmax_layer = Linear( - input_dim=self.hidden_dim, output_dim=2, act='softmax') + input_dim=self.num_filters, output_dim=self.hidden_dim) def forward(self, left, pos_right, neg_right): - left = fluid.layers.reshape( - left, shape=[-1, self.seq_len, self.hidden_dim]) - pos_right = fluid.layers.reshape( - pos_right, shape=[-1, self.seq_len, self.hidden_dim]) - neg_right = fluid.layers.reshape( - neg_right, shape=[-1, self.seq_len, self.hidden_dim]) + left = self.emb_layer(left) + pos_right = self.emb_layer(pos_right) + neg_right = self.emb_layer(neg_right) left_cnn = self.encoder_layer(left) + left_cnn = fluid.layers.transpose(left_cnn, perm=[0, 2, 1]) pos_right_cnn = self.encoder_layer(pos_right) + pos_right_cnn = fluid.layers.transpose(pos_right_cnn, perm=[0, 2, 1]) neg_right_cnn = self.encoder_layer(neg_right) + neg_right_cnn = fluid.layers.transpose(neg_right_cnn, perm=[0, 2, 1]) left_fc = self.fc_layer(left_cnn) pos_right_fc = self.fc_layer(pos_right_cnn) neg_right_fc = self.fc_layer(neg_right_cnn) @@ -71,10 +65,6 @@ def forward(self, left, pos_right, neg_right): class Point_CNNModel(Model): - """ - classify model - """ - def __init__(self, conf_dict): super(Point_CNNModel, self).__init__() self.dict_size = conf_dict["dict_size"] @@ -85,7 +75,13 @@ def __init__(self, conf_dict): self.hidden_dim = conf_dict["net"]["hidden_dim"] self.seq_len = conf_dict["seq_len"] self.padding_idx = None - #layers + + self.emb_layer = Embedding( + size=[self.dict_size, self.emb_dim], + is_sparse=True, + padding_idx=self.padding_idx, + param_attr=fluid.ParamAttr( + name='emb', initializer=fluid.initializer.Xavier())) 
diff --git a/examples/similarity_net/nets/gru.py b/examples/similarity_net/nets/gru.py new file mode 100644 index 0000000..c68c13c --- /dev/null +++ b/examples/similarity_net/nets/gru.py @@ -0,0 +1,103 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +gru class +""" +import numpy as np +from paddle.fluid.dygraph import Layer, to_variable, Embedding, Linear, GRUUnit +import paddle.fluid as fluid + +from paddle.incubate.hapi.model import Model +from paddle.incubate.hapi.text.text import RNN, BasicGRUCell + + +class GRUEncoder(Layer): + def __init__(self, dict_size, emb_dim, gru_dim, hidden_dim, padding_idx): + super(GRUEncoder, self).__init__() + self.dict_size = dict_size + self.emb_dim = emb_dim + self.gru_dim = gru_dim + self.hidden_dim = hidden_dim + self.padding_idx = padding_idx + + self.emb_layer = Embedding( + size=[self.dict_size, self.emb_dim], + is_sparse=True, + padding_idx=self.padding_idx, + param_attr=fluid.ParamAttr( + name='emb', initializer=fluid.initializer.Xavier())) + cell = BasicGRUCell( + input_size=self.gru_dim * 3, hidden_size=self.hidden_dim) + self.gru_layer = RNN(cell=cell) + self.proj_layer = Linear( + input_dim=self.emb_dim, output_dim=self.gru_dim * 3) + + def forward(self, input): + emb = self.emb_layer(input) + emb_proj = self.proj_layer(emb) + gru, _ = self.gru_layer(emb_proj) + gru = fluid.layers.reduce_max(gru, dim=1) + gru = fluid.layers.tanh(gru) + return gru + + +class Pair_GRUModel(Model): + def __init__(self, conf_dict): + super(Pair_GRUModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.gru_dim = conf_dict["net"]["gru_dim"] + self.hidden_dim = conf_dict["net"]["hidden_dim"] + self.padding_idx = None + self.emb_layer = GRUEncoder(self.dict_size, self.emb_dim, self.gru_dim, + self.hidden_dim, self.padding_idx) + self.fc_layer = Linear( + input_dim=self.hidden_dim, output_dim=self.hidden_dim) + + def forward(self, left, pos_right, neg_right): + left_emb = self.emb_layer(left) + pos_right_emb = self.emb_layer(pos_right) + neg_right_emb = self.emb_layer(neg_right) + left_fc = self.fc_layer(left_emb) + pos_right_fc = self.fc_layer(pos_right_emb) + neg_right_fc = self.fc_layer(neg_right_emb) + pos_pred = fluid.layers.cos_sim(left_fc, pos_right_fc) + neg_pred = fluid.layers.cos_sim(left_fc, neg_right_fc) + return pos_pred, neg_pred + + +class Point_GRUModel(Model): + def __init__(self, conf_dict): + super(Point_GRUModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.gru_dim = conf_dict["net"]["gru_dim"] + self.hidden_dim = conf_dict["net"]["hidden_dim"] + self.padding_idx = None + self.emb_layer = GRUEncoder(self.dict_size, self.emb_dim, self.gru_dim, + self.hidden_dim, self.padding_idx) + self.fc_layer_po = Linear( + input_dim=self.hidden_dim * 2, output_dim=self.hidden_dim) + self.softmax_layer = Linear( + input_dim=self.hidden_dim, output_dim=2, act='softmax') + + def forward(self, left, right): + left_emb = self.emb_layer(left) + right_emb = self.emb_layer(right) + concat = fluid.layers.concat([left_emb, right_emb], axis=1) + concat_fc = self.fc_layer_po(concat) + pred = self.softmax_layer(concat_fc) + return pred
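Editor's note: GRUEncoder projects embeddings up to gru_dim * 3, matching the input width declared for BasicGRUCell (hence input_dim=self.emb_dim on the projection above, since it consumes the raw embedding), then max-pools the recurrent states over time and squashes with tanh. A NumPy sketch of the pooling stage, with the recurrent cell mocked by random per-step hidden states:

```python
import numpy as np

batch, seq_len, hidden_dim = 2, 5, 8        # toy sizes
# stand-in for the per-step hidden states returned by RNN(BasicGRUCell)
gru_states = np.random.randn(batch, seq_len, hidden_dim)

pooled = gru_states.max(axis=1)             # reduce_max over the time axis (dim=1)
encoded = np.tanh(pooled)                   # [batch, hidden_dim] sentence vector
```

Max-over-time pooling keeps the strongest activation of each hidden unit across the sequence, giving a fixed-size sentence vector regardless of length.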
+""" +gru class +""" +import numpy as np +from paddle.fluid.dygraph import Layer, to_variable, Embedding, Linear, GRUUnit +import paddle.fluid as fluid + +from paddle.incubate.hapi.model import Model +from paddle.incubate.hapi.text.text import RNN, BasicGRUCell + + +class GRUEncoder(Layer): + def __init__(self, dict_size, emb_dim, gru_dim, hidden_dim, padding_idx): + super(GRUEncoder, self).__init__() + self.dict_size = dict_size + self.emb_dim = emb_dim + self.gru_dim = gru_dim + self.hidden_dim = hidden_dim + self.padding_idx = padding_idx + + self.emb_layer = Embedding( + size=[self.dict_size, self.emb_dim], + is_sparse=True, + padding_idx=self.padding_idx, + param_attr=fluid.ParamAttr( + name='emb', initializer=fluid.initializer.Xavier())) + cell = BasicGRUCell( + input_size=self.gru_dim * 3, hidden_size=self.hidden_dim) + self.gru_layer = RNN(cell=cell) + self.proj_layer = Linear( + input_dim=self.hidden_dim, output_dim=self.gru_dim * 3) + + def forward(self, input): + emb = self.emb_layer(input) + emb_proj = self.proj_layer(emb) + gru, _ = self.gru_layer(emb_proj) + gru = fluid.layers.reduce_max(gru, dim=1) + gru = fluid.layers.tanh(gru) + return gru + + +class Pair_GRUModel(Model): + def __init__(self, conf_dict): + super(Pair_GRUModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.gru_dim = conf_dict["net"]["gru_dim"] + self.hidden_dim = conf_dict["net"]["hidden_dim"] + self.padding_idx = None + self.emb_layer = GRUEncoder(self.dict_size, self.emb_dim, self.gru_dim, + self.hidden_dim, self.padding_idx) + self.fc_layer = Linear( + input_dim=self.hidden_dim, output_dim=self.hidden_dim) + + def forward(self, left, pos_right, neg_right): + left_emb = self.emb_layer(left) + pos_right_emb = self.emb_layer(pos_right) + neg_right_emb = self.emb_layer(neg_right) + left_fc = self.fc_layer(left_emb) + pos_right_fc = self.fc_layer(pos_right_emb) + neg_right_fc = self.fc_layer(neg_right_emb) + pos_pred = fluid.layers.cos_sim(left_fc, pos_right_fc) + neg_pred = fluid.layers.cos_sim(left_fc, neg_right_fc) + return pos_pred, neg_pred + + +class Point_GRUModel(Model): + def __init__(self, conf_dict): + super(Point_GRUModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.gru_dim = conf_dict["net"]["gru_dim"] + self.hidden_dim = conf_dict["net"]["hidden_dim"] + self.padding_idx = None + self.emb_layer = GRUEncoder(self.dict_size, self.emb_dim, self.gru_dim, + self.hidden_dim, self.padding_idx) + self.fc_layer_fo = Linear( + input_dim=self.hidden_dim * 2, output_dim=self.hidden_dim) + self.softmax_layer = Linear( + input_dim=self.hidden_dim, output_dim=2, act='softmax') + + def forward(self, left, right): + left_emb = self.emb_layer(left) + right_emb = self.emb_layer(right) + concat = fluid.layers.concat([left_emb, right_emb], axis=1) + concat_fc = self.fc_layer_fo(concat) + pred = self.softmax_layer(concat_fc) + return pred diff --git a/examples/similarity_net/nets/losses/hinge_loss.py b/examples/similarity_net/nets/losses/hinge_loss.py index 6081f8f..317e18d 100644 --- a/examples/similarity_net/nets/losses/hinge_loss.py +++ b/examples/similarity_net/nets/losses/hinge_loss.py @@ -18,7 +18,7 @@ import sys sys.path.append("../") import paddle.fluid as fluid -from paddle.incubate.hapi.model import Loss +from paddle.incubate.hapi.loss import Loss class HingeLoss(Loss): @@ -34,6 +34,6 @@ def 
diff --git a/examples/similarity_net/nets/losses/log_loss.py b/examples/similarity_net/nets/losses/log_loss.py index a11d1e4..26d99a0 100644 --- a/examples/similarity_net/nets/losses/log_loss.py +++ b/examples/similarity_net/nets/losses/log_loss.py @@ -18,7 +18,7 @@ import sys sys.path.append("../") import paddle.fluid as fluid -from paddle.incubate.hapi.model import Loss +from paddle.incubate.hapi.loss import Loss class LogLoss(Loss): diff --git a/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py b/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py index ec19c5a..b410040 100644 --- a/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py +++ b/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py @@ -18,14 +18,24 @@ import sys sys.path.append("../") import paddle.fluid as fluid -from paddle.incubate.hapi.model import Loss +from hapi.model import Loss +''' +class SoftmaxCrossEntropyLoss(Loss): + def __init__(self,conf_dict): + super(SoftmaxCrossEntropyLoss,self).__init__() + def forward(self,input,label): + cost=fluid.layers.cross_entropy(input=input,label=label) + avg_cost=fluid.layers.reduce_mean(cost) + return avg_cost +''' -class SoftmxCrossEntropyLoss(Loss): - def __init__(self, conf_dict): - super(SoftmxCrossEntropyLoss, self).__init__() - def forward(self, input, label): - cost = fluid.layers.cross_entropy(input=input, label=label) - avg_cost = fluid.layers.reduce_mean(cost) - return avg_cost +class SoftmaxCrossEntropyLoss(Loss): + def __init__(self, conf_dict, average=True): + super(SoftmaxCrossEntropyLoss, self).__init__() + + def forward(self, outputs, labels): + return [ + fluid.layers.cross_entropy(o, l) for o, l in zip(outputs, labels) + ] diff --git a/examples/similarity_net/nets/lstm.py b/examples/similarity_net/nets/lstm.py new file mode 100644 index 0000000..b1eba70 --- /dev/null +++ b/examples/similarity_net/nets/lstm.py @@ -0,0 +1,119 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +lstm class +""" +import numpy as np +from paddle.fluid.dygraph import Layer, Embedding, Linear +import paddle.fluid as fluid + +from paddle.incubate.hapi.model import Model +from paddle.incubate.hapi.text.text import BasicLSTMCell, RNN + + +class LSTMEncoder(Layer): + def __init__(self, dict_size, emb_dim, lstm_dim, hidden_dim, padding_idx): + + super(LSTMEncoder, self).__init__() + self.dict_size = dict_size + self.emb_dim = emb_dim + self.lstm_dim = lstm_dim + self.hidden_dim = hidden_dim + self.is_reverse = False + self.padding_idx = padding_idx + + self.emb_layer = Embedding( + size=[self.dict_size, self.emb_dim], + is_sparse=True, + padding_idx=self.padding_idx, + param_attr=fluid.ParamAttr( + name='emb', initializer=fluid.initializer.Xavier())) + + self.lstm_cell = BasicLSTMCell( + input_size=self.lstm_dim * 4, hidden_size=self.lstm_dim) + self.lstm_layer = RNN(cell=self.lstm_cell, + time_major=False, + is_reverse=self.is_reverse) + self.proj_layer = Linear( + input_dim=self.emb_dim, output_dim=self.lstm_dim * 4) + + def forward(self, input): + emb = self.emb_layer(input) + emb_proj = self.proj_layer(emb) + emb_lstm, _ = self.lstm_layer(emb_proj) + emb_reduce = fluid.layers.reduce_max(emb_lstm, dim=1) + emb_out = fluid.layers.tanh(emb_reduce) + return emb_out + + +class Pair_LSTMModel(Model): + def __init__(self, conf_dict): + super(Pair_LSTMModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.lstm_dim = conf_dict["net"]["lstm_dim"] + self.hidden_dim = conf_dict["net"]["hidden_dim"] + self.padding_idx = None + + self.emb_layer = LSTMEncoder(self.dict_size, self.emb_dim, + self.lstm_dim, self.hidden_dim, + self.padding_idx) + + self.fc_layer = Linear( + input_dim=self.hidden_dim, output_dim=self.hidden_dim) + self.fc_layer_po = Linear( + input_dim=self.hidden_dim * 2, output_dim=self.hidden_dim) + self.softmax_layer = Linear( + input_dim=self.hidden_dim, output_dim=2, act='softmax') + + def forward(self, left, pos_right, neg_right): + left_emb = self.emb_layer(left) + pos_right_emb = self.emb_layer(pos_right) + neg_right_emb = self.emb_layer(neg_right) + left_fc = self.fc_layer(left_emb) + pos_right_fc = self.fc_layer(pos_right_emb) + neg_right_fc = self.fc_layer(neg_right_emb) + pos_pred = fluid.layers.cos_sim(left_fc, pos_right_fc) + neg_pred = fluid.layers.cos_sim(left_fc, neg_right_fc) + return pos_pred, neg_pred + + +class Point_LSTMModel(Model): + def __init__(self, conf_dict): + super(Point_LSTMModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.lstm_dim = conf_dict["net"]["lstm_dim"] + self.hidden_dim = conf_dict["net"]["hidden_dim"] + self.padding_idx = None + + self.emb_layer = LSTMEncoder(self.dict_size, self.emb_dim, + self.lstm_dim, self.hidden_dim, + self.padding_idx) + + self.fc_layer = Linear( + input_dim=self.hidden_dim, output_dim=self.hidden_dim) + self.fc_layer_po = Linear( + input_dim=self.hidden_dim * 2, output_dim=self.hidden_dim) + self.softmax_layer = Linear( + input_dim=self.hidden_dim, output_dim=2, act='softmax') + + def forward(self, left, right): + left_emb = self.emb_layer(left) + right_emb = self.emb_layer(right) + concat = fluid.layers.concat([left_emb, right_emb], axis=1) + concat_fc = self.fc_layer_po(concat) + pred = self.softmax_layer(concat_fc) + return pred
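Editor's note: LSTMEncoder mirrors the GRU encoder, but the projection widens to lstm_dim * 4 because an LSTM keeps four fused gates. A NumPy sketch of how a fused cell weight would split one step into gate slices (shapes and gate order are assumptions for illustration; BasicLSTMCell does this internally):

```python
import numpy as np

lstm_dim = 8                                   # toy size
input_size = lstm_dim * 4                      # width produced by proj_layer
# assume one fused weight of shape [input_size + hidden, 4 * hidden]
w = np.random.randn(input_size + lstm_dim, 4 * lstm_dim)

x = np.random.randn(input_size)                # projected embedding for one time step
h = np.zeros(lstm_dim)                         # previous hidden state
gates = np.concatenate([x, h]) @ w             # [4 * lstm_dim]
i, c, f, o = np.split(gates, 4)                # one [lstm_dim] slice per gate (order assumed)
```

Note also that the RNN wrapper above is built with time_major=False: the projected embeddings it consumes are batch-major, and the reduce_max pooling assumes the time axis is dim=1.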
+""" +gru class +""" +import numpy as np +from paddle.fluid.dygraph import Layer, Embedding, Linear +import paddle.fluid as fluid + +from paddle.incubate.hapi.model import Model +from paddle.incubate.hapi.text.text import BasicLSTMCell, RNN + + +class LSTMEncoder(Layer): + def __init__(self, dict_size, emb_dim, lstm_dim, hidden_dim, padding_idx): + + super(LSTMEncoder, self).__init__() + self.dict_size = dict_size + self.emb_dim = emb_dim + self.lstm_dim = lstm_dim + self.hidden_dim = hidden_dim + self.is_reverse = False + self.padding_idx = padding_idx + + self.emb_layer = Embedding( + size=[self.dict_size, self.emb_dim], + is_sparse=True, + padding_idx=self.padding_idx, + param_attr=fluid.ParamAttr( + name='emb', initializer=fluid.initializer.Xavier())) + + self.lstm_cell = BasicLSTMCell( + input_size=self.lstm_dim * 4, hidden_size=self.lstm_dim) + self.lstm_layer = RNN(cell=self.lstm_cell, + time_major=True, + is_reverse=self.is_reverse) + self.proj_layer = Linear( + input_dim=self.emb_dim, output_dim=self.lstm_dim * 4) + + def forward(self, input): + emb = self.emb_layer(input) + emb_proj = self.proj_layer(emb) + emb_lstm, _ = self.lstm_layer(emb_proj) + emb_reduce = fluid.layers.reduce_max(emb_lstm, dim=1) + emb_out = fluid.layers.tanh(emb_reduce) + return emb_out + + +class Pair_LSTMModel(Model): + def __init__(self, conf_dict): + super(Pair_LSTMModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.lstm_dim = conf_dict["net"]["lstm_dim"] + self.hidden_dim = conf_dict["net"]["hidden_dim"] + self.padding_idx = None + + self.emb_layer = LSTMEncoder(self.dict_size, self.emb_dim, + self.lstm_dim, self.hidden_dim, + self.padding_idx) + + self.fc_layer = Linear( + input_dim=self.hidden_dim, output_dim=self.hidden_dim) + self.fc_layer_po = Linear( + input_dim=self.hidden_dim * 2, output_dim=self.hidden_dim) + self.softmax_layer = Linear( + input_dim=self.hidden_dim, output_dim=2, act='softmax') + + def forward(self, left, pos_right, neg_right): + left_emb = self.emb_layer(left) + pos_right_emb = self.emb_layer(pos_right) + neg_right_emb = self.emb_layer(neg_right) + left_fc = self.fc_layer(left_emb) + pos_right_fc = self.fc_layer(pos_right_emb) + neg_right_fc = self.fc_layer(neg_right_emb) + pos_pred = fluid.layers.cos_sim(left_fc, pos_right_fc) + neg_pred = fluid.layers.cos_sim(left_fc, neg_right_fc) + return pos_pred, neg_pred + + +class Point_LSTMModel(Model): + def __init__(self, conf_dict): + super(Point_LSTMModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.lstm_dim = conf_dict["net"]["lstm_dim"] + self.hidden_dim = conf_dict["net"]["hidden_dim"] + self.padding_idx = None + + self.emb_layer = LSTMEncoder(self.dict_size, self.emb_dim, + self.lstm_dim, self.hidden_dim, + self.padding_idx) + + self.fc_layer = Linear( + input_dim=self.hidden_dim, output_dim=self.hidden_dim) + self.fc_layer_po = Linear( + input_dim=self.hidden_dim * 2, output_dim=self.hidden_dim) + self.softmax_layer = Linear( + input_dim=self.hidden_dim, output_dim=2, act='softmax') + + def forward(self, left, right): + left_emb = self.emb_layer(left) + right_emb = self.emb_layer(right) + concat = fluid.layers.concat([left_emb, right_emb], axis=1) + concat_fc = self.fc_layer_po(concat) + pred = self.softmax_layer(concat_fc) + return pred diff --git a/examples/similarity_net/reader.py b/examples/similarity_net/reader.py index d61a83c..e483229 100644 --- 
a/examples/similarity_net/reader.py +++ b/examples/similarity_net/reader.py @@ -184,7 +184,7 @@ def reader_with_pointwise(): query = self.padding_text(query) title = self.padding_text(title) - lebel = int(label) + label = int(label) yield [query, title, label] else: @@ -246,7 +246,7 @@ def get_infer_reader(self): yield [query, title] - def get_infer_pairdata(self): + def get_infer_data(self): """ get infer data """ diff --git a/examples/similarity_net/run.sh b/examples/similarity_net/run.sh index 69c4486..519a818 100644 --- a/examples/similarity_net/run.sh +++ b/examples/similarity_net/run.sh @@ -15,7 +15,7 @@ INFER_RESULT_PATH=./infer_result TASK_MODE='pairwise' CONFIG_PATH=./config/bow_pairwise.json -INIT_CHECKPOINT=./model_files/bow_pairwise/200 +INIT_CHECKPOINT=./model_files/bow_pairwise/20 @@ -36,9 +36,9 @@ train() { --config_path ${CONFIG_PATH} \ --vocab_path ${VOCAB_PATH} \ --epoch 40 \ - --save_steps 2000 \ - --validation_steps 200 \ - --compute_accuracy False \ + --save_steps 10 \ + --validation_steps 2 \ + --compute_accuracy True \ --lamda 0.958 \ --task_mode ${TASK_MODE}\ --init_checkpoint "" @@ -49,14 +49,13 @@ evaluate() { --task_name ${TASK_NAME} \ --use_cuda false \ --do_test True \ - --verbose_result True \ --batch_size 128 \ --test_data_dir ${TEST_DATA_PATH} \ --test_result_path ${TEST_RESULT_PATH} \ --config_path ${CONFIG_PATH} \ --vocab_path ${VOCAB_PATH} \ --task_mode ${TASK_MODE} \ - --compute_accuracy False \ + --compute_accuracy True \ --lamda 0.958 \ --init_checkpoint ${INIT_CHECKPOINT} } @@ -98,4 +97,4 @@ main() { ;; esac } -main "$@" \ No newline at end of file +main "$@" diff --git a/examples/similarity_net/run_classifier.py b/examples/similarity_net/run_classifier.py index e5f120b..04f3222 100644 --- a/examples/similarity_net/run_classifier.py +++ b/examples/similarity_net/run_classifier.py @@ -34,39 +34,38 @@ from utils import load_vocab, import_class, get_accuracy, ArgConfig, print_arguments from paddle.incubate.hapi.metrics import Accuracy -from paddle.incubate.hapi.model import set_device, Model, Input, Loss, CrossEntropy +from paddle.incubate.hapi.model import set_device, Model, Input +from paddle.incubate.hapi.loss import Loss + + +# define auc method +def valid_and_test(pred_list, process, mode): + """ + return auc and acc + """ + metric = fluid.metrics.Auc(name="auc") + pred_list = np.vstack(pred_list) + if mode == "test": + label_list = process.get_test_label() + elif mode == "valid": + label_list = process.get_valid_label() + if args.task_mode == "pairwise": + pred_list = (pred_list + 1) / 2 + pred_list = np.hstack((np.ones_like(pred_list) - pred_list, pred_list)) + metric.reset() + metric.update(pred_list, label_list) + auc = metric.eval() + if args.compute_accuracy: + acc = get_accuracy(pred_list, label_list, args.task_mode, args.lamda) + return auc, acc + else: + return auc def train(conf_dict, args): device = set_device("cpu") fluid.enable_dygraph(device) - # load auc method - metric = fluid.metrics.Auc(name="auc") - - def valid_and_test(pred_list, process, mode): - """ - return auc and acc - """ - pred_list = np.vstack(pred_list) - if mode == "test": - label_list = process.get_test_label() - elif mode == "valid": - label_list = process.get_valid_label() - if args.task_mode == "pairwise": - pred_list = (pred_list + 1) / 2 - pred_list = np.hstack( - (np.ones_like(pred_list) - pred_list, pred_list)) - metric.reset() - metric.update(pred_list, label_list) - auc = metric.eval() - if args.compute_accuracy: - acc = get_accuracy(pred_list, label_list, 
args.task_mode, - args.lamda) - return auc, acc - else: - return auc - # loading vocabulary vocab = load_vocab(args.vocab_path) # get vocab size @@ -120,9 +119,9 @@ def valid_and_test(pred_list, process, mode): if args.task_mode == "pairwise": inputs = [ Input( - [None, 1], 'int64', name='input_left'), Input( - [None, 1], 'int64', name='pos_right'), Input( - [None, 1], 'int64', name='neg_right') + [None, args.seq_len], 'int64', name='input_left'), Input( + [None, args.seq_len], 'int64', name='pos_right'), Input( + [None, args.seq_len], 'int64', name='neg_right') ] model.prepare( @@ -132,9 +131,11 @@ def valid_and_test(pred_list, process, mode): device=device) for left, pos_right, neg_right in train_pyreader(): - input_left = fluid.layers.reshape(left, shape=[-1, 1]) - pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) - neg_right = fluid.layers.reshape(neg_right, shape=[-1, 1]) + input_left = fluid.layers.reshape(left, shape=[-1, args.seq_len]) + pos_right = fluid.layers.reshape( + pos_right, shape=[-1, args.seq_len]) + neg_right = fluid.layers.reshape( + neg_right, shape=[-1, args.seq_len]) final_loss = model.train_batch([input_left, pos_right, neg_right]) print("train_steps: %d, train_loss: %f" % @@ -144,26 +145,29 @@ def valid_and_test(pred_list, process, mode): if args.do_valid and global_step % args.validation_steps == 0: for left, pos_right, neg_right in valid_pyreader(): - input_left = fluid.layers.reshape(left, shape=[-1, 1]) - pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) - neg_right = fluid.layers.reshape(neg_right, shape=[-1, 1]) + input_left = fluid.layers.reshape( + left, shape=[-1, args.seq_len]) + pos_right = fluid.layers.reshape( + pos_right, shape=[-1, args.seq_len]) + neg_right = fluid.layers.reshape( + neg_right, shape=[-1, args.seq_len]) result, _ = model.test_batch( [input_left, pos_right, neg_right]) - pred_list += list(result) - valid_step += 1 + pred_list += list(result) + valid_step += 1 valid_result = valid_and_test(pred_list, simnet_process, "valid") if args.compute_accuracy: valid_auc, valid_acc = valid_result print( "valid_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" - % (global_step, valid_auc, valid_acc, np.mean(losses))) + % (valid_step, valid_auc, valid_acc, np.mean(losses))) else: valid_auc = valid_result print("valid_steps: %d, valid_auc: %f, valid_loss: %f" % - (global_step, valid_auc, np.mean(losses))) + (valid_step, valid_auc, np.mean(losses))) if global_step % args.save_steps == 0: model_save_dir = os.path.join(args.output_dir, @@ -177,20 +181,21 @@ def valid_and_test(pred_list, process, mode): else: inputs = [ Input( - [None, 1], 'int64', name='left'), Input( - [None, 1], 'int64', name='right') + [None, args.seq_len], 'int64', name='left'), Input( + [None, args.seq_len], 'int64', name='right') ] label = [Input([None, 1], 'int64', name='neg_right')] model.prepare( inputs=inputs, + labels=label, optimizer=optimizer, loss_function=loss, device=device) for left, right, label in train_pyreader(): - left = fluid.layers.reshape(left, shape=[-1, 1]) - right = fluid.layers.reshape(right, shape=[-1, 1]) + left = fluid.layers.reshape(left, shape=[-1, args.seq_len]) + right = fluid.layers.reshape(right, shape=[-1, args.seq_len]) label = fluid.layers.reshape(label, shape=[-1, 1]) final_loss = model.train_batch([left, right], [label]) @@ -201,26 +206,27 @@ def valid_and_test(pred_list, process, mode): if args.do_valid and global_step % args.validation_steps == 0: for left, right, label in valid_pyreader(): - valid_left =
fluid.layers.reshape(left, shape=[-1, 1]) - valid_right = fluid.layers.reshape(right, shape=[-1, 1]) + valid_left = fluid.layers.reshape( + left, shape=[-1, args.seq_len]) + valid_right = fluid.layers.reshape( + right, shape=[-1, args.seq_len]) valid_label = fluid.layers.reshape(label, shape=[-1, 1]) - result, _ = model.test_batch( - [valid_left, valid_right, valid_right]) + result = model.test_batch([valid_left, valid_right]) pred_list += list(result) - valid_step += 1 + valid_step += 1 valid_result = valid_and_test(pred_list, simnet_process, "valid") if args.compute_accuracy: valid_auc, valid_acc = valid_result print( "valid_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" - % (global_step, valid_auc, valid_acc, np.mean(losses))) + % (valid_step, valid_auc, valid_acc, np.mean(losses))) else: valid_auc = valid_result print("valid_steps: %d, valid_auc: %f, valid_loss: %f" % - (global_step, valid_auc, np.mean(losses))) + (valid_step, valid_auc, np.mean(losses))) if global_step % args.save_steps == 0: model_save_dir = os.path.join(args.output_dir, @@ -236,31 +242,6 @@ def test(conf_dict, args): device = set_device("cpu") fluid.enable_dygraph(device) - metric = fluid.metrics.Auc(name="auc") - - def valid_and_test(pred_list, process, mode): - """ - return auc and acc - """ - pred_list = np.vstack(pred_list) - if mode == "test": - label_list = process.get_test_label() - elif mode == "valid": - label_list = process.get_valid_label() - if args.task_mode == "pairwise": - pred_list = (pred_list + 1) / 2 - pred_list = np.hstack( - (np.ones_like(pred_list) - pred_list, pred_list)) - metric.reset() - metric.update(pred_list, label_list) - auc = metric.eval() - if args.compute_accuracy: - acc = get_accuracy(pred_list, label_list, args.task_mode, - args.lamda) - return auc, acc - else: - return auc - # loading vocabulary vocab = load_vocab(args.vocab_path) # get vocab size @@ -286,17 +267,19 @@ def valid_and_test(pred_list, process, mode): if args.task_mode == "pairwise": inputs = [ Input( - [None, 1], 'int64', name='input_left'), Input( - [None, 1], 'int64', name='pos_right'), Input( - [None, 1], 'int64', name='pos_right') + [None, args.seq_len], 'int64', name='input_left'), Input( + [None, args.seq_len], 'int64', name='pos_right'), Input( + [None, args.seq_len], 'int64', name='neg_right') ] model.prepare(inputs=inputs, device=device) for left, pos_right, neg_right in test_pyreader(): - input_left = fluid.layers.reshape(left, shape=[-1, 1]) - pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) - neg_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + input_left = fluid.layers.reshape(left, shape=[-1, args.seq_len]) + pos_right = fluid.layers.reshape( + pos_right, shape=[-1, args.seq_len]) + neg_right = fluid.layers.reshape( + neg_right, shape=[-1, args.seq_len]) final_pred, _ = model.test_batch( [input_left, pos_right, neg_right]) @@ -315,15 +298,15 @@ def valid_and_test(pred_list, process, mode): else: inputs = [ Input( - [None, 1], 'int64', name='left'), Input( - [None, 1], 'int64', name='right') + [None, args.seq_len], 'int64', name='left'), Input( + [None, args.seq_len], 'int64', name='right') ] model.prepare(inputs=inputs, device=device) for left, right, label in test_pyreader(): - left = fluid.layers.reshape(left, shape=[-1, 1]) - right = fluid.layers.reshape(right, shape=[-1, 1]) + left = fluid.layers.reshape(left, shape=[-1, args.seq_len]) + right = fluid.layers.reshape(right, shape=[-1, args.seq_len]) label = fluid.layers.reshape(label, shape=[-1, 1]) final_pred =
model.test_batch([left, right]) @@ -368,16 +351,19 @@ def infer(conf_dict, args): if args.task_mode == "pairwise": inputs = [ Input( - [None, 1], 'int64', name='input_left'), Input( - [None, 1], 'int64', name='pos_right') + [None, args.seq_len], 'int64', name='input_left'), Input( + [None, args.seq_len], 'int64', name='pos_right'), Input( + [None, args.seq_len], 'int64', name='neg_right') ] model.prepare(inputs=inputs, device=device) for left, pos_right in infer_pyreader(): - input_left = fluid.layers.reshape(left, shape=[-1, 1]) - pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) - neg_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + input_left = fluid.layers.reshape(left, shape=[-1, args.seq_len]) + pos_right = fluid.layers.reshape( + pos_right, shape=[-1, args.seq_len]) + neg_right = fluid.layers.reshape( + pos_right, shape=[-1, args.seq_len]) final_pred, _ = model.test_batch( [input_left, pos_right, neg_right]) @@ -388,16 +374,15 @@ def infer(conf_dict, args): else: inputs = [ Input( - [None, 1], 'int64', name='left'), Input( - [None, 1], 'int64', name='right') + [None, args.seq_len], 'int64', name='left'), Input( + [None, args.seq_len], 'int64', name='right') ] model.prepare(inputs=inputs, device=device) for left, right in infer_pyreader(): - left = fluid.layers.reshape(left, shape=[-1, 1]) - right = fluid.layers.reshape(right, shape=[-1, 1]) - # label = fluid.layers.reshape(label, shape=[-1, 1]) + left = fluid.layers.reshape(left, shape=[-1, args.seq_len]) + right = fluid.layers.reshape(right, shape=[-1, args.seq_len]) final_pred = model.test_batch([left, right]) print(final_pred) @@ -405,8 +390,7 @@ def infer(conf_dict, args): map(lambda item: str((item[0] + 1) / 2), final_pred)) with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file: - for _data, _pred in zip(simnet_process.get_infer_data(), - int(pred_list)): + for _data, _pred in zip(simnet_process.get_infer_data(), pred_list): infer_file.write(_data + "\t" + _pred + "\n") @@ -423,4 +407,5 @@ def infer(conf_dict, args): elif args.do_infer: infer(conf_dict, args) else: - raise ValueError("one of do_train and do_infer must be True") + raise ValueError( + "one of do_train, do_test and do_infer must be True") diff --git a/examples/similarity_net/utils.py b/examples/similarity_net/utils.py index e62b999..b5b313d 100644 --- a/examples/similarity_net/utils.py +++ b/examples/similarity_net/utils.py @@ -27,7 +27,7 @@ import pickle import warnings from functools import partial -from hapi.configure import ArgumentGroup, str2bool +from paddle.incubate.hapi.configure import ArgumentGroup, str2bool """ ******functions for file processing****** """ @@ -183,7 +183,8 @@ def __init__(self): run_type_g.add_arg("do_train", bool, False, "Whether to perform training.") run_type_g.add_arg("do_valid", bool, False, "Whether to perform dev.") - #run_type_g.add_arg("do_test", bool, False, "Whether to perform testing.") + run_type_g.add_arg("do_test", bool, False, "Whether to perform testing.") run_type_g.add_arg("do_infer", bool, False, "Whether to perform inference.") run_type_g.add_arg("compute_accuracy", bool, False,
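Editor's note on the patch above: the module-level valid_and_test helper rescales pairwise cos_sim scores from [-1, 1] into [0, 1] and stacks them into the two-column [P(label=0), P(label=1)] layout that fluid.metrics.Auc.update expects. A NumPy sketch of that conversion:

```python
import numpy as np

cos = np.array([[0.8], [-0.2], [0.5]])       # raw cos_sim outputs, each in [-1, 1]
p = (cos + 1) / 2                            # rescale to [0, 1]
preds = np.hstack((np.ones_like(p) - p, p))  # columns: [P(label=0), P(label=1)]
print(preds)                                 # shape [n, 2], ready for Auc.update
```

Hoisting the helper out of train() also lets test() reuse it instead of carrying its own copy.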
From 03dc76394d905537615e8a967f55750f6907e9df Mon Sep 17 00:00:00 2001 From: jinyuKing <2943829328@qq.com> Date: Wed, 3 Jun 2020 04:38:31 +0000 Subject: [PATCH 7/7] add hapi/simnet_model --- .../nets/losses/softmax_cross_entropy_loss.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py b/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py index b410040..9701538 100644 --- a/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py +++ b/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py @@ -18,17 +18,7 @@ import sys sys.path.append("../") import paddle.fluid as fluid -from hapi.model import Loss -''' -class SoftmaxCrossEntropyLoss(Loss): - def __init__(self,conf_dict): - super(SoftmaxCrossEntropyLoss,self).__init__() - - def forward(self,input,label): - cost=fluid.layers.cross_entropy(input=input,label=label) - avg_cost=fluid.layers.reduce_mean(cost) - return avg_cost -''' +from paddle.incubate.hapi.loss import Loss class SoftmaxCrossEntropyLoss(Loss):