diff --git a/models/cross-dimensional-attention/README.md b/models/cross-dimensional-attention/README.md
new file mode 100644
index 0000000..b6e1a4f
--- /dev/null
+++ b/models/cross-dimensional-attention/README.md
@@ -0,0 +1,24 @@
+# Cross-Dimensional-Attention Model
+
+## [CDSA: Cross-Dimensional Self-Attention for Multivariate, Geo-tagged Time Series Imputation](https://arxiv.org/pdf/1905.09904.pdf)
+_Jiawei Ma, Zheng Shou, Alireza Zareian, Hassan Mansour, Anthony Vetro, Shih-Fu Chang_
+
+Many real-world applications involve multivariate, geo-tagged time series data: at
+each location, multiple sensors record corresponding measurements. For example,
+an air quality monitoring system records PM2.5, CO, etc. The resulting time-series
+data often has missing values due to device outages or communication errors. In
+order to impute the missing values, state-of-the-art methods are built on Recurrent
+Neural Networks (RNNs), which process each time stamp sequentially, prohibiting
+the direct modeling of the relationship between distant time stamps. Recently, the
+self-attention mechanism has been proposed for sequence modeling tasks such as
+machine translation, significantly outperforming RNNs because the relationship
+between any two time stamps can be modeled explicitly. In this paper, we are the first
+to adapt the self-attention mechanism to multivariate, geo-tagged time series data.
+In order to jointly capture the self-attention across multiple dimensions, including
+time, location and sensor measurements, while maintaining low computational
+complexity, we propose a novel approach called Cross-Dimensional Self-Attention
+(CDSA) to process each dimension sequentially, yet in an order-independent manner.
+Our extensive experiments on four real-world datasets, including three standard
+benchmarks and our newly collected NYC-traffic dataset, demonstrate that our
+approach outperforms state-of-the-art imputation and forecasting methods. A
+detailed systematic analysis confirms the effectiveness of our design choices.
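+
+### Cross-dimensional attention at a glance
+
+A minimal NumPy sketch of the idea described above: attention maps are built over
+one dimension at a time and applied in sequence to the same value tensor. This is an
+illustration only, not the code in this repository; all names and toy shapes below
+are made up for the example.
+
+```python
+import numpy as np
+
+def softmax(x, axis=-1):
+    e = np.exp(x - x.max(axis=axis, keepdims=True))
+    return e / e.sum(axis=axis, keepdims=True)
+
+def attend_over_axis(value, query, key, axis):
+    # Attention map over a single dimension, applied along that same axis of `value`.
+    att = softmax(np.dot(query, key.T) / np.sqrt(key.shape[-1]))  # (n, n)
+    value = np.moveaxis(value, axis, 0)                           # bring target axis first
+    out = np.tensordot(att, value, axes=(1, 0))                   # mix entries along it
+    return np.moveaxis(out, 0, axis)
+
+# Toy geo-tagged series: (locations, measurements, time)
+value = np.random.randn(4, 3, 10)
+q_time = k_time = np.random.randn(10, 8)  # stand-in per-time-stamp embeddings
+q_loc = k_loc = np.random.randn(4, 8)     # stand-in per-location embeddings
+
+out = attend_over_axis(value, q_time, k_time, axis=2)   # attend across time
+out = attend_over_axis(out, q_loc, k_loc, axis=0)       # then across location
+```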
+
+![image](https://drive.google.com/uc?export=view&id=1qJzGTBewD-Q9gT7M1FaKqAKW7H6S4OBK)
diff --git a/models/cross-dimensional-attention/tensorflow/Base_Handler.py b/models/cross-dimensional-attention/tensorflow/Base_Handler.py
new file mode 100644
index 0000000..f9062a2
--- /dev/null
+++ b/models/cross-dimensional-attention/tensorflow/Base_Handler.py
@@ -0,0 +1,82 @@
+# uncompyle6 version 3.7.0
+# Python bytecode 2.7 (62211)
+# Decompiled from: Python 3.6.9 (default, Apr 18 2020, 01:56:04)
+# [GCC 8.4.0]
+# Embedded file name: /home/jiawei/Tensor_MultiDim_NYC/Model/Base_Handler.py
+# Compiled at: 2019-04-29 02:51:25
+import tensorflow as tf
+import numpy as np
+import yaml
+import os
+import h5py
+import time
+import sys
+import math
+from Lib.Model_Visualization import *
+
+class Basement_Handler(object):
+
+    def __init__(self, sess, model_config, is_training):
+        self.sess = sess
+        self.model_config = model_config
+        self.max_grad_norm = float(model_config.get('max_grad_norm', 5.0))
+        self.init_logging(is_training)
+        self.logger.info(model_config)
+
+    def init_logging(self, is_training):
+        # Validation results and trained models are logged to separate folders
+        if not is_training:
+            base_dir = self.model_config.get('result_data')
+            folder_dir = self.model_config.get('category') + '_for_' + self.model_config.get('data_name')
+        else:
+            base_dir = self.model_config.get('result_model')
+            folder_dir = generate_folder_id(self.model_config)
+        log_dir = os.path.join(self.model_config.get('result_dir'), base_dir, folder_dir)
+        if not os.path.exists(log_dir):
+            os.makedirs(log_dir)
+        self.log_dir = log_dir
+        self.logger = get_logger(self.log_dir, folder_dir)
+        self.writer = tf.summary.FileWriter(self.log_dir)
+
+    def trainable_parameter_info(self):
+        total_parameters = 0
+        for variable in tf.trainable_variables():
+            total_parameters += np.prod([x.value for x in variable.get_shape()])
+        self.logger.info('Total number of trainable parameters: %d' % total_parameters)
+        for var in tf.global_variables():
+            self.logger.debug('%s, %s' % (var.name, var.get_shape()))
+
+    def summary_logging(self, global_step, names, values):
+        for name, value in zip(names, values):
+            summary = tf.Summary()
+            summary_value = summary.value.add()
+            summary_value.simple_value = value
+            summary_value.tag = name
+            self.writer.add_summary(summary, global_step)
+
+    def save_model(self, saver, epoch, val_loss):
+        config_filename = 'config_%02d.yaml' % epoch
+        config = dict(self.model_config)
+        global_step = self.sess.run(tf.train.get_or_create_global_step())
+        config['epoch'] = epoch
+        config['global_step'] = global_step
+        config['log_dir'] = self.log_dir
+        config['model_filename'] = saver.save(self.sess, os.path.join(self.log_dir, 'models-%.4f' % val_loss), global_step=global_step, write_meta_graph=False)
+        with open(os.path.join(self.log_dir, config_filename), 'w') as f:
+            yaml.dump(config, f)
+        return config['model_filename']
+
+    def restore(self):
+        config = dict(self.model_config)
+        self.pred_step = config['global_step']
+        model_filename = config['model_filename']
+        saver = tf.train.Saver(tf.global_variables())
+        saver.restore(self.sess, model_filename)
+
+    def train_test_valid_assignment(self):
+        pass
+
+    def initial_parameter(self):
+        pass
+
+    def data_assignment(self):
+        pass
+
+    def train(self):
+        pass
+
+    def test(self):
+        pass
\ No newline at end of file
diff --git a/models/cross-dimensional-attention/tensorflow/Base_Handler.pyc b/models/cross-dimensional-attention/tensorflow/Base_Handler.pyc
new file mode 100644
index 0000000..8cb1785
Binary files /dev/null and b/models/cross-dimensional-attention/tensorflow/Base_Handler.pyc differ
diff --git a/models/cross-dimensional-attention/tensorflow/Base_TFModel.py b/models/cross-dimensional-attention/tensorflow/Base_TFModel.py
new file mode 100644
index 0000000..16105da
--- /dev/null
+++ b/models/cross-dimensional-attention/tensorflow/Base_TFModel.py
@@ -0,0 +1,36 @@
+# uncompyle6 version 3.7.0
+# Python bytecode 2.7 (62211)
+# Decompiled from: Python 3.6.9 (default, Apr 18 2020, 01:56:04)
+# [GCC 8.4.0]
+# Embedded file name: /home/jiawei/Tensor_MultiDim_NYC/Model/Base_TFModel.py
+# Compiled at: 2019-04-29 02:51:25
+import os, tensorflow as tf
+
+class Basement_TFModel(object):
+    """Define and initialize the basic/necessary elements of a TensorFlow model."""
+
+    def __init__(self, sess, config, learning_rate, is_training):
+        self.sess = sess
+        self.config = config
+        self.is_training = is_training
+        self.model_name = config.get('model_name', 'MDAnalyzer')
+        self.train_op = None
+        self.learning_rate = learning_rate
+        self.max_grad_norm = float(config.get('max_grad_norm', 5.0))
+        self.loss = None
+        self.loss_func = config.get('loss_func', 'RMSE')
+        self.maximum_type = int(config.get('upbound', 1))
+
+    def set_lr(self, new_learning_rate):
+        self.learning_rate = new_learning_rate
+
+    def save_checkpoint(self, step=None):
+        pass
+
+    def load_checkpoint(self, step=None):
+        pass
+
+    def initial_parameter(self):
+        pass
\ No newline at end of file
diff --git a/models/cross-dimensional-attention/tensorflow/Base_TFModel.pyc b/models/cross-dimensional-attention/tensorflow/Base_TFModel.pyc
new file mode 100644
index 0000000..edfae20
Binary files /dev/null and b/models/cross-dimensional-attention/tensorflow/Base_TFModel.pyc differ
diff --git a/models/cross-dimensional-attention/tensorflow/BurstLoss_Generation.ipynb b/models/cross-dimensional-attention/tensorflow/BurstLoss_Generation.ipynb
new file mode 100644
index 0000000..56b31a1
--- /dev/null
+++ b/models/cross-dimensional-attention/tensorflow/BurstLoss_Generation.ipynb
@@ -0,0 +1,187 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/dvmm-filer2/users/jiawei/anaconda2/lib/python2.7/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
+      "  from ._conv import register_converters as _register_converters\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import h5py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[u'data', u'mask', u'maximum', u'position']\n",
+      "(186, 6624) (186, 6624) (186, 2) 0.9157041\n",
+      "25.585367 0.9157041 25.585367 2.992725 0.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "combine_file = h5py.File('data_mask_maximum.h5','r')\n",
+    "print combine_file.keys()\n",
+    "data = combine_file['data'][:]\n",
+    "mask = combine_file['mask'][:]\n",
+    "position = combine_file['position'][:]\n",
+    "maximum = combine_file['maximum'].value\n",
+    "combine_file.close()\n",
+    "print data.shape,mask.shape,position.shape,mask.mean()\n",
+    "print maximum,mask.mean(),data.max(),data.mean(),data.min()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(24290,)\n",
+      "(5887,)\n",
+      "6.350773 9.809643 0.08429594\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Measure the natural missing bursts: length of each run of consecutive missing steps per camera\n",
+    "mask_miss = 1.0-mask\n",
+    "mask_time,mask_delta = np.zeros_like(mask),np.zeros_like(mask)\n",
+    "mask_time[:,0] = mask_miss[:,0]\n",
+    "for cam in range(186):\n",
+    "    for time in range(1,6624):\n",
+    "        if mask_miss[cam,time] == 1:\n",
+    "            mask_time[cam,time] = mask_time[cam,time-1]+1\n",
+    "    mask_delta[cam,:] = mask_time[cam,:]\n",
+    "    while(time>0):\n",
+    "        if mask_time[cam,time] > 1:\n",
+    "            length = int(mask_time[cam,time]-1)\n",
+    "            mask_time[cam,time-length:time] = mask_time[cam,time]\n",
+    "            mask_delta[cam,time-length:time] = 0\n",
+    "            time -= length\n",
+    "        time -= 1\n",
+    "delta_seq = mask_delta[mask_delta != 0]\n",
+    "delta_seq = np.sort(delta_seq)\n",
+    "delta_burst = delta_seq[delta_seq<=134]\n",
+    "print delta_burst.shape\n",
+    "delta_burst = delta_burst[delta_burst > 1]\n",
+    "print delta_burst.shape\n",
+    "print delta_burst.mean(),delta_burst.std(),mask_miss.mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "missing rate: 0.9009605050086975\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Draw synthetic burst lengths from N(burst_mean, burst_std) and overlay them on the natural mask\n",
+    "burst_mean,burst_std,mask_burst,ratio = delta_burst.mean(),delta_burst.std(),np.zeros_like(mask_miss),0.618\n",
+    "for cam in range(186):\n",
+    "    time = 0\n",
+    "    while(time<6624):\n",
+    "        if np.random.uniform(size=1) < ratio:\n",
+    "            length = int(np.random.normal(burst_mean, burst_std, 1))\n",
+    "            if length > 0:\n",
+    "                mask_burst[cam,time:time+length] = 1.0\n",
+    "                time += length\n",
+    "            else:\n",
+    "                time += 1\n",
+    "        else:\n",
+    "            time += 1\n",
+    "move = mask_burst+mask_miss\n",
+    "move[move>1] = 1\n",
+    "move = 1.0-move\n",
+    "for i in range(6624-48*12+1):\n",
+    "    if move[:,i:i+48*12].mean() == 0:\n",
+    "        print i\n",
+    "print 'missing rate:', 1.0-move.mean()\n",
+    "\n",
+    "combine_file = h5py.File('burst_90.h5','w')\n",
+    "combine_file['mask'] = move\n",
+    "combine_file.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.7999438 1.0 0.0\n",
+      "[]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Sanity-check a saved mask: binary values only and no fully-missing 48*12 window\n",
+    "combine_file = h5py.File('burst_20.h5','r')\n",
+    "move = combine_file['mask'][:]\n",
+    "combine_file.close()\n",
+    "print move.mean(),move.max(),move.min()\n",
+    "for i in range(6624-48*12+1):\n",
+    "    if move[:,i:i+48*12].mean() == 0:\n",
+    "        print i\n",
+    "temp = move[move!=0]\n",
+    "\n",
+    "print temp[temp!=1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/models/cross-dimensional-attention/tensorflow/Config.yaml b/models/cross-dimensional-attention/tensorflow/Config.yaml
new file mode 100644
index 0000000..c01e7b5
--- /dev/null
+++ b/models/cross-dimensional-attention/tensorflow/Config.yaml
@@ -0,0 +1,55 @@
+---
+# === Configure Main function === #
+category: Traffic-NYC
+data_name: data_mask_maximum
+mask_name: burst_70
+meas_index: 1
+model_name: MDAnalyzer
+result_dir: Result
+result_model: Model-Config
+result_data: Validation-Result
+GPU: 1
+# === Configure Model Handler === #
+# Data Assignment
+batch_size: 19
+period_enc: 432
+period_dec: 432
+
+# Model training Controller
+epoch_iter: 600
+epochs: 100
+patience: 50
+learning_rate: 0.001
+lr_decay: 0.3
+lr_decay_epoch: 20
+lr_decay_interval: 15
+loss_func: RMSE
+upbound: 1
+max_grad_norm: 5.0
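+
+# Worked example of the decay schedule above (derived from calculate_scheduled_lr
+# in MultiDim_Analyzer_Handler.py):
+#   new_lr = learning_rate * lr_decay ** ceil((epoch - lr_decay_epoch) / lr_decay_interval),
+# floored at min_lr=1e-6. With the values above: epochs 0-20 run at 0.001,
+# epochs 21-35 at 0.0003, epochs 36-50 at 0.00009, and so on.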
+
+# === Configure in MDAnalyzer Model pipeline === #
+# Frame: Sequence Element-wise-addition Concatenation Dimension-reduce Independent
+model_structure: Sequence
+num_enclayer: 7
+num_declayer: 7
+num_heads: 12
+units_IDw: 14
+units_Timew: 6
+# Query, Key and Value
+time_stride_AM: 1
+time_stride_V: 1
+time_fuse_AM: 1
+time_fuse_V: 1
+units_value: 3
+units_weight: 12
+drop_rate_attention: 0.0
+# Filter SetUp
+units_conv: 8
+units_pred: 8
+filter_encdec: dense
+filter_pred: dense
+drop_rate_forward: 0.1
+# Mask label
+flag_identity: False
+flag_time: False
+flag_imputation: True
diff --git a/models/cross-dimensional-attention/tensorflow/MultiDim_Analyzer_Handler.py b/models/cross-dimensional-attention/tensorflow/MultiDim_Analyzer_Handler.py
new file mode 100644
index 0000000..e932542
--- /dev/null
+++ b/models/cross-dimensional-attention/tensorflow/MultiDim_Analyzer_Handler.py
@@ -0,0 +1,166 @@
+import tensorflow as tf
+import numpy as np
+import yaml
+import os
+import h5py
+import time
+import sys
+import math
+
+from Lib.Data_Processing import *
+from Lib.Utility import *
+from Model.MultiDim_Analyzer_Model import MultiDim_Analyzer
+from Model.Base_Handler import Basement_Handler
+
+
+class MDAnalyzer_Handler(Basement_Handler):
+    def __init__(self, dataset_name, model_config, sess, is_training=True):
+
+        # Initialization of configuration, parameters and datasets
+        super(MDAnalyzer_Handler, self).__init__(sess=sess, model_config=model_config, is_training=is_training)
+        self.initial_parameter()
+        self.data_assignment(dataset_name)
+
+        # Define the general model and the corresponding inputs
+        self.shape_enc = (self.batch_size, self.num_identity, self.period_enc)
+        self.shape_dec = (self.batch_size, self.num_identity, self.period_dec)
+        self.input_enc = tf.placeholder(tf.float32, shape=self.shape_enc, name='encoder_inputs')
+        self.input_ori = tf.placeholder(tf.float32, shape=self.shape_dec, name='decoder_inputs')
+        self.truth_pred = tf.placeholder(tf.float32, shape=self.shape_dec, name='ground_truth')
+        self.truth_mask = tf.placeholder(tf.float32, shape=self.shape_dec, name='natural_missing')
+        self.move_mask = tf.placeholder(tf.float32, shape=self.shape_dec, name='removed_missing')
+        self.shared_info = tf.placeholder(tf.float32, shape=(self.num_identity, self.num_shared_feature), name='position')
+        with tf.variable_scope("impute", reuse=tf.AUTO_REUSE):
+            self.impute_segs = tf.get_variable("impute_var", shape=self.shape_enc, trainable=True,
+                                               initializer=tf.random_normal_initializer(mean=0, stddev=0.1))
+        # Initialization of the model training structure.
+        self.learning_rate = tf.get_variable('learning_rate', shape=(), initializer=tf.constant_initializer(self.lr_init), trainable=False)
+        self.lr_new = tf.placeholder(tf.float32, shape=(), name='lr_new')
+        self.lr_update = tf.assign(self.learning_rate, self.lr_new, name='lr_update')
+
+        self.train_test_valid_assignment()
+        self.trainable_parameter_info()
+        self.saver = tf.train.Saver(tf.global_variables())
+
+    def initial_parameter(self):
+
+        # Configuration set
+        config = self.model_config
+
+        # Model input initialization
+        self.batch_size = int(config.get('batch_size', 1))
+        self.period_enc = int(config.get('period_enc', 12))
+        self.period_dec = int(config.get('period_dec', 12))
+
+        # Initialization of the training controller
+        self.epochs = int(config.get('epochs', 100))
+        self.epoch_iter = int(config.get('epoch_iter', 5))
+        self.patience = int(config.get('patience', 30))
+        self.lr_init = float(config.get('learning_rate', 0.001))
+        self.lr_decay = float(config.get('lr_decay', 0.1))
+        self.lr_decay_epoch = int(config.get('lr_decay_epoch', 20))
+        self.lr_decay_interval = int(config.get('lr_decay_interval', 10))
+
+    def data_assignment(self, dataset_name):
+        model_config = self.model_config
+        set_whole, self.node_pos, self.maximum = Data_Division(dataset_name)
+
+        # Pre-calculation for model training
+        self.scalar = limit_scalar(set_whole)
+        self.set_whole, disp_whole = self.scalar.transform(set_whole)
+        self.num_identity, self.num_shared_feature = self.node_pos.shape[0], self.node_pos.shape[1]
+        self.whole_segs = (set_whole[0].shape[-1]-self.period_enc)/2+1
+        self.whole_size = int(self.whole_segs/self.batch_size)
+
+        # Display the data structure of the dataset
+        print 'Available Segments[batches] %d[%d] Shape of data/mask piece %s and min-mean-max is %.2f-%.2f-%.2f' % (
+            self.whole_segs, self.whole_size, set_whole[0].shape, disp_whole[0], disp_whole[1], disp_whole[2])
+        print 'Measurement maximum(average,std) are %4.4f(%4.4f,%4.4f)' % (self.maximum, self.scalar.mean, self.scalar.std)
+
+        # Data generator
+        self.gen_whole = Data_Generator(self.set_whole, set_whole[0], self.whole_segs, self.batch_size, self.period_enc, is_training=True)
+
+    def train_test_valid_assignment(self):
+
+        # The original mask uses 1 to indicate a missing point; the inversion is done in the Data_Division function
+        value_sets = (
+            tf.expand_dims(self.input_enc, -1),   # the input value of the encoder (current data, with values randomly removed)
+            tf.expand_dims(self.input_ori, -1),   # the input value of the decoder (future/current data, with values randomly removed)
+            tf.expand_dims(self.truth_pred, -1),  # the ground truth of the future/current prediction
+            tf.expand_dims(self.truth_mask, -1),  # the label of NATURALLY missing data: 0 -- missing, 1 -- available
+            tf.expand_dims(self.move_mask, -1),   # the label of removed (artificially masked) data: 0 -- missing, 1 -- available
+            self.shared_info,                     # the shared information of the nodes (position), normalized to [0,1]
+            self.scalar                           # (class instance) rescales the input and the output
+        )
+        with tf.name_scope('Train'):
+            with tf.variable_scope('MultiDim_Analyzer', reuse=False):
+                self.MD_Analyzer_train = MultiDim_Analyzer(value_sets, self.learning_rate, self.sess, self.model_config, is_training=True)
+
+    def train(self):
+        self.sess.run(tf.global_variables_initializer())
+        print ('Training Started')
+        min_impute_metric = float('inf')
+        epoch_cnt, wait = 0, 0
+
+        start_time = time.time()
+        while epoch_cnt <= self.epochs:
+
+            # Training preparation: learning-rate pre-setting, model interface summary.
+            cur_lr = self.calculate_scheduled_lr(epoch_cnt)
+            whole_fetches = {'global_step': tf.train.get_or_create_global_step(),
+                             'train_op': self.MD_Analyzer_train.train_op,
+                             'preds': self.MD_Analyzer_train.orig_impute,
+                             'metric': self.MD_Analyzer_train.orig_metric,
+                             'loss': self.MD_Analyzer_train.loss}
+            Results = {"loss": [], "imputed": [], "metric": [], "ground": [], "mask_compare": []}
+            # Framework and visualization setup for training
+            for trained_batch in range(0, self.whole_size):
+                (curdata, curmask, curmove, curdata_orig) = self.gen_whole.next()
+                feed_dict_whole = {self.input_enc: curdata*curmove,
+                                   self.input_ori: curdata_orig,
+                                   self.truth_pred: curdata,
+                                   self.truth_mask: curmask,
+                                   self.move_mask: curmove,
+                                   self.shared_info: self.node_pos}
+                whole_output = self.sess.run(whole_fetches, feed_dict=feed_dict_whole)
+                message = "Epoch [%3d/%3d] [%d/%d] lr: %.4f, loss: %.8f" % (
+                    epoch_cnt, self.epochs, trained_batch, self.whole_size, cur_lr, whole_output["loss"])
+                if trained_batch % 50 == 0:
+                    print message
+
+                Results["metric"].append(whole_output['metric'])
+                Results["loss"].append(whole_output['loss'])
+                Results["imputed"].append(whole_output['preds'])
+                Results["ground"].append(curdata_orig)
+                Results["mask_compare"].append(curmask-curmove)
+            global_step = whole_output['global_step']
+
+            loss, metric_seg = np.mean(Results["loss"]), np.mean(Results["metric"], axis=0)
+            if metric_seg[0] <= min_impute_metric:
+                min_impute_metric = metric_seg[0]
+                metrics = calculate_metrics_np(Results["imputed"], Results["ground"], Results["mask_compare"])
+
+            # Information logging for model training and validation (maybe for curve plotting)
+            summary_format = ['loss/train_loss', 'metric/mse_segmin', 'metric/rmse', 'metric/mae', 'metric/mape', 'metric/mre']
+            summary_data = [loss, min_impute_metric, metrics[1], metrics[2], metrics[3], metrics[4]]
+            self.summary_logging(global_step, summary_format, summary_data)
+            # Message summary of each epoch (for info.log logging)
+            message = 'Epoch [%3d/%3d] loss: %.4f(%.4f), Orig MSE/RMSE/MAE/MAPE/MRE %s' % (
+                epoch_cnt, self.epochs, np.mean(Results["loss"]), min_impute_metric, metrics)
+            self.logger.info(message)
+            epoch_cnt += 1
+        print '%ds' % (time.time()-start_time)
+
+    def calculate_scheduled_lr(self, epoch, min_lr=1e-6):
+        # Stepwise exponential decay: no decay before lr_decay_epoch, then one extra
+        # factor of lr_decay every lr_decay_interval epochs, floored at min_lr
+        decay_factor = int(math.ceil((epoch - self.lr_decay_epoch) / float(self.lr_decay_interval)))
+        new_lr = self.lr_init * self.lr_decay ** max(0, decay_factor)
+        new_lr = max(min_lr, new_lr)
+
+        self.logger.info('Current learning rate set to: %.6f' % new_lr)
+        sys.stdout.flush()
+
+        self.sess.run(self.lr_update, feed_dict={self.lr_new: new_lr})
+        self.MD_Analyzer_train.set_lr(self.learning_rate)
+        return new_lr
\ No newline at end of file
diff --git a/models/cross-dimensional-attention/tensorflow/MultiDim_Analyzer_Handler.pyc b/models/cross-dimensional-attention/tensorflow/MultiDim_Analyzer_Handler.pyc
new file mode 100644
index 0000000..d1e6d7c
Binary files /dev/null and b/models/cross-dimensional-attention/tensorflow/MultiDim_Analyzer_Handler.pyc differ
diff --git a/models/cross-dimensional-attention/tensorflow/MultiDim_Analyzer_Model.py b/models/cross-dimensional-attention/tensorflow/MultiDim_Analyzer_Model.py
new file mode 100644
index 0000000..8a07c6c
--- /dev/null
+++ b/models/cross-dimensional-attention/tensorflow/MultiDim_Analyzer_Model.py
@@ -0,0 +1,362 @@
+import tensorflow as tf
+import numpy as np
+import math
+import os
+import json
+
+from Lib.Utility import *
+from Model.Base_TFModel import Basement_TFModel
+
+class MultiDim_Analyzer(Basement_TFModel):
+
+    def __init__(self, value_sets, init_learning_rate, sess, config, is_training=True, *args, **kwargs):
+
+        super(MultiDim_Analyzer, self).__init__(sess=sess, config=config, learning_rate=init_learning_rate, is_training=is_training)
+        '''
+        Model input explanation & reminder:
+        enc_input:   the masked input of the encoder; its time dimension may differ from dec_input [data generation and assignment in the handler]
+        dec_input:   the masked input of the decoder; its time dimension may differ from enc_input [data generation and assignment in the handler]
+        truth_pred:  used in the loss calculation; has the same dimensions as the decoder input, same (shifted) value for completion (prediction)
+        truth_mask:  used in the calculation of metrics such as MSE, and to relabel the model output for comparison (mask of natural loss)
+        shared_info: auxiliary information for encoding the identity dimension of the encoder/decoder input; encoding the time dimension is similar to NLP
+        scalar:      the maximum value of each measurement, used for metric calculation
+
+        For data in the value_sets (data unzip):
+        Size of model input/output: (batch_size, num_identity, num_measurement, period_enc/dec, 1)
+        Size of auxiliary shared_info: (batch_size, num_shared_feature)
+        '''
+        (enc_input, ori_input, truth_pred, truth_mask, move_mask, shared_info, scalar) = value_sets
+        self.num_identity = enc_input.get_shape().as_list()[1]
+
+        # Initialization of the model hyperparameters, enc-dec structure, evaluation metric & optimizer
+        self.initial_parameter()
+        self.model_input, self.model_output = self.encdec_handler(enc_input, shared_info, truth_pred, truth_mask, move_mask)
+        self.metric_opt(self.model_output, ori_input, truth_pred, truth_mask, move_mask, scalar)
+
+    def encdec_handler(self, enc_input, shared_info, truth_pred, truth_mask, move_mask):
+        # For the dimension layout of the input, refer to the class initialization above.
+
+        # Options for Model Structure (illustrated in the sketch below)
+        # Independent -------------- 3 Enc-Dec models are used to learn the relationship of each dimension respectively
+        # Sequence ----------------- Following an arbitrary order to do the multiplication on the value vector
+        # Element-wise-addition ---- Multiplication on the same value vector, then element-wise addition (average)
+        # Concatenation ------------ Multiplication on the same value vector, then concatenation and matmul (generalized ew-addition)
+        # Dimension-reduce --------- Expand the input 3D value to a 1D vector and then calculate the huge AM. (limitation of memory)
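+        #
+        # Illustration of how these modes combine the per-dimension products (a
+        # sketch added for readability, not code from the original source; see
+        # softmax_combination below for the actual ops). With AM_I the
+        # identity-by-identity attention map, AM_T the time-by-time map, and V
+        # the value tensor:
+        #   Sequence:              out = AM_T x (AM_I x V)      applied axis by axis
+        #   Element-wise-addition: out = (AM_I x V + AM_T x V) / 2
+        #   Concatenation:         out = dense(concat([AM_I x V, AM_T x V]))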
+
+        # The auxiliary encoding is available for the Identity&Measurement and Time dimensions
+        (shared_encoder, shared_decoder, time_encoder, time_decoder) = self.auxiliary_encode(shared_info)
+        if self.flag_casuality:
+            mask_casuality = self.casual_mask()
+        else:
+            mask_casuality = None
+        if self.flag_imputation:
+            self.mask_imputation = self.impute_mask()
+        else:
+            self.mask_imputation = None
+        if self.flag_time:
+            enc_input = enc_input + time_encoder
+        if self.flag_identity:
+            enc_input = enc_input + shared_encoder
+
+        with tf.variable_scope('layer_init'):
+            enc_init = self.multihead_attention(enc_input, self.attention_unit, self.model_structure)
+            enc_init = self.feed_forward_layer(enc_init, self.conv_unit, self.filter_encdec)
+            enc_init = enc_input + tf.multiply(tf.constant(1.0, dtype=tf.float32)-move_mask, enc_init)
+        topenc = self.encoder(enc_input, self.model_structure)
+        return enc_init, topenc
+
+    def encoder(self, enclayer_init, model_structure):
+        enclayer_in = enclayer_init
+        with tf.variable_scope('Encoder'):
+            for cnt_enclayer in range(0, self.num_enclayer):
+                with tf.variable_scope('layer_%d' % (cnt_enclayer)):
+                    enclayer_in = self.layer_norm(enclayer_in + self.multihead_attention(
+                        enclayer_in, self.attention_unit, model_structure), 'norm_1')
+                    enclayer_in = self.layer_norm(enclayer_in + self.feed_forward_layer(
+                        enclayer_in, self.conv_unit, self.filter_encdec), 'norm_2')
+            with tf.variable_scope('Enc_pred1'):
+                enclayer_in = self.multihead_attention(enclayer_in, self.attention_unit, model_structure)
+                enclayer_in = self.feed_forward_layer(enclayer_in, self.conv_unit, self.filter_encdec)
+        return enclayer_in
+
+    def decoder(self, declayer_init, model_structure, encoder_top, mask_casuality=None):
+        declayer_in = declayer_init
+        with tf.variable_scope('Decoder'):
+            with tf.variable_scope('layer_0'):
+                declayer_in = self.layer_norm(declayer_in + self.multihead_attention(
+                    declayer_in, self.attention_unit, model_structure, mask=mask_casuality), 'norm_1')
+                (attention_out, KVtop_share) = self.multihead_attention(
+                    declayer_in, self.attention_unit, model_structure, top_encod=encoder_top, scope='enc-dec-attention')
+                declayer_in = self.layer_norm(declayer_in + attention_out, 'norm_2')
+                declayer_in = self.layer_norm(declayer_in + self.feed_forward_layer(
+                    declayer_in, self.conv_unit, self.filter_encdec), 'norm_3')
+            for cnt_declayer in range(1, self.num_declayer):
+                with tf.variable_scope('layer_%d' % (cnt_declayer)):
+                    declayer_in = self.layer_norm(declayer_in + self.multihead_attention(
+                        declayer_in, self.attention_unit, model_structure, mask=mask_casuality), 'norm_1')
+                    declayer_in = self.layer_norm(declayer_in + self.multihead_attention(
+                        declayer_in, self.attention_unit, model_structure, top_encod=encoder_top, cache=KVtop_share,
+                        scope='enc-dec-attention'), 'norm_2')
+                    declayer_in = self.layer_norm(declayer_in + self.feed_forward_layer(
+                        declayer_in, self.conv_unit, self.filter_encdec), 'norm_3')
+            with tf.variable_scope('Dec_pred_1'):
+                declayer_in = self.multihead_attention(declayer_in, self.attention_unit, model_structure)
+                declayer_in = self.feed_forward_layer(declayer_in, self.conv_unit, self.filter_encdec)
+
+        return declayer_in
+
+    def metric_opt(self, model_output, truth_orig, truth_pred, truth_mask, move_mask, scalar):
+
+        loss_mask = move_mask
+        global_step = tf.train.get_or_create_global_step()
+        avail_output = tf.multiply(model_output, loss_mask)
+        avail_truth = tf.multiply(truth_pred, loss_mask)
+
+        if self.loss_func == 'MSE':
+            self.loss = loss_mse(avail_output, avail_truth, loss_mask)
+        elif self.loss_func == 'RMSE':
+            self.loss = loss_rmse(avail_output, avail_truth, loss_mask)
+        elif self.loss_func == 'MAE':
+            self.loss = loss_mae(avail_output, avail_truth, loss_mask)
+        else:
+            self.loss = loss_rmse(avail_output, avail_truth, loss_mask) + loss_mae(avail_output, avail_truth, loss_mask)
+
+        if self.is_training:
+            optimizer = tf.train.AdamOptimizer(self.learning_rate)
+            tvars = tf.trainable_variables()
+            grads = tf.gradients(self.loss, tvars)
+            grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
+            self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step, name='train_op')
+        self.info_merge = tf.summary.merge_all()
+
+        orig_preds = scalar.TFinverse_transform(model_output)
+        orig_truth = tf.multiply(truth_orig, truth_mask-move_mask)
+        # Keep the observed values and fill the removed positions with (rescaled) predictions
+        self.orig_impute = tf.multiply(truth_orig, move_mask) + tf.multiply(orig_preds, tf.constant(1.0, dtype=tf.float32)-move_mask)
+        # Evaluate only on positions that are naturally available but were artificially removed
+        self.orig_metric = calculate_metrics(tf.multiply(orig_preds, truth_mask-move_mask), orig_truth, truth_mask-move_mask)
+
+    def multihead_attention(self, att_input, att_unit, model_structure, top_encod=None, cache=None, mask=None, scope='self-attention'):
+        """
+        att_input: the input to be processed by this module, with size [batch, #identity, #measurement, time, 1]
+        att_unit:  the hyperparameters for the dimensions of Q/K and V
+        top_encod: the output from the top encoder layer [batch_size, #identity, #measurement, time, 1]
+        mask:      masks the causality of the self-attention layer, [batch, time, time] or [batch, #id*#meas*time, #id*#meas*time]
+
+        3D convolution is applied to realize the unit expansion. For convenience of application, we use the following index mapping:
+        [batch, in_depth, in_height, in_width, in_channels] = [batch_size, num_identity, num_measurement, length_time, 1]
+        """
+        # Initialization of some necessary items
+        (value_units, Iunits, Tunits) = att_unit
+        KVtop_cache = None
+        # Since num_measurement and the period are small and may equal each other by coincidence,
+        # we use the dimension of num_identity
+        V_filters, V_kernal, V_stride = value_units*self.num_heads, (1, self.V_timfuse), (1, self.V_timjump)
+        batch, ids, time = att_input.get_shape().as_list()[:3]
+
+        with tf.variable_scope(scope):
+            if top_encod is None or cache is None:
+                if top_encod is None:
+                    top_encod = att_input
+                else:
+                    if cache is None:
+                        KVtop_cache = {}
+                # Linear projection (unit expansion) for the multi-head attention dimension:
+                Q_iden = tf.layers.dense(tf.reshape(att_input, [batch, ids, -1]), self.num_heads*Iunits,
+                                         use_bias=False, name='Q_ID')
+                K_iden = tf.layers.dense(tf.reshape(top_encod, [batch, ids, -1]), self.num_heads*Iunits,
+                                         use_bias=False, name='K_ID')
+                Q_time = tf.layers.dense(tf.reshape(tf.transpose(att_input, [0, 2, 1, 3]), [batch, time, -1]), self.num_heads*Tunits,
+                                         use_bias=False, name='Q_Time')
+                K_time = tf.layers.dense(tf.reshape(tf.transpose(top_encod, [0, 2, 1, 3]), [batch, time, -1]), self.num_heads*Tunits,
+                                         use_bias=False, name='K_Time')
+                V = tf.layers.conv2d(inputs=top_encod, filters=V_filters, kernel_size=V_kernal, strides=V_stride,
+                                     padding="same", data_format="channels_last", name='V')
+                if KVtop_cache is not None:
+                    KVtop_cache = {'share_Kid': K_iden, 'share_Ktime': K_time, 'share_V': V}
+            else:
+                Q_iden = tf.layers.dense(tf.reshape(att_input, [batch, ids, -1]), self.num_heads*Iunits,
+                                         use_bias=False, name='Q_ID')
+                Q_time = tf.layers.dense(tf.reshape(tf.transpose(att_input, [0, 2, 1, 3]), [batch, time, -1]), self.num_heads*Tunits,
+                                         use_bias=False, name='Q_Time')
+                K_iden, K_time, V = cache['share_Kid'], cache['share_Ktime'], cache['share_V']
+
+            # Split the matrices into multiple heads and concatenate to build a larger batch:
+            # [self.batch_size*self.num_heads, self.X, self.X]
+            Qhb_id = tf.concat(tf.split(Q_iden, self.num_heads, axis=2), axis=0)
+            Qhb_time = tf.concat(tf.split(Q_time, self.num_heads, axis=2), axis=0)
+            Khb_id = tf.concat(tf.split(K_iden, self.num_heads, axis=2), axis=0)
+            Khb_time = tf.concat(tf.split(K_time, self.num_heads, axis=2), axis=0)
+            # [self.batch_size*self.num_heads, self.num_identity, self.num_measurement, self.length_time, 'hidden-units']
+            Q_headbatch = (Qhb_id, Qhb_time)
+            K_headbatch = (Khb_id, Khb_time)
+            V_headbatch = tf.concat(tf.split(V, self.num_heads, axis=3), axis=0)
+
+            if mask is not None:
+                mask_recur = tf.tile(mask, [self.num_heads, 1, 1])
+            else:
+                mask_recur = None
+
+            out = self.softmax_combination(Q_headbatch, K_headbatch, V_headbatch, model_structure, att_unit, mask_recur)
+
+            # Merge the multiple heads back to the original shape
+            # [batch_size, self.num_identity, self.num_measurement, self.length_time, 'hidden-units'*self.num_heads]
+            out = tf.concat(tf.split(out, self.num_heads, axis=0), axis=3)
+            out = tf.layers.dense(out, 1, name='multihead_fuse')
+            out = tf.layers.dropout(out, rate=self.attdrop_rate, training=self.is_training)
+
+        if KVtop_cache is None:
+            return out
+        else:
+            return (out, KVtop_cache)
+
+    def feed_forward_layer(self, info_attention, num_hunits, filter_type='dense'):
+        '''
+        filter_type:
+            "dense"     indicates a dense layer,
+            "graph"     indicates a graph-based FIR filter (graph convolution),
+            "attention" indicates applying the attention algorithm,
+            "conv"      indicates a shared convolution kernel applied instead of a big weight matrix.
+        self.ffndrop_rate may be considered later 03122019
+        '''
+        channel = info_attention.get_shape().as_list()[-1]
+        if filter_type == 'dense':
+            ffn_dense = tf.layers.dense(info_attention, num_hunits, use_bias=True, activation=tf.nn.relu, name=filter_type+'1')
+            ffn_dense = tf.layers.dense(ffn_dense, num_hunits, use_bias=True, activation=None, name=filter_type+'2')
+            return tf.layers.dense(ffn_dense, channel, use_bias=True, activation=None, name=filter_type+'3')
+        elif filter_type == 'graph':
+            raise NotImplementedError
+        elif filter_type == 'attention':
+            raise NotImplementedError
+        elif filter_type == 'conv':
+            raise NotImplementedError
+
+    def layer_norm(self, norm_input, name_stage):
+        norm_step = tf.contrib.layers.layer_norm(tf.transpose(tf.squeeze(norm_input), perm=[0, 2, 1]),
+                                                 begin_norm_axis=2, center=True, scale=True, scope=name_stage)
+        return tf.expand_dims(tf.transpose(norm_step, perm=[0, 2, 1]), -1)
+
+    def softmax_combination(self, Q, K, V, model_structure, att_unit, mask=None):
+        '''The mask is applied before the softmax layer; no dropout is applied.'''
+        value_units, segs = V.get_shape().as_list()[-1], V.get_shape().as_list()[0]
+        ids, time = Q[0].get_shape().as_list()[1], Q[1].get_shape().as_list()[1]
+        (value_units, Iunits, Tunits) = att_unit
+
+        (Q_I, Q_T) = Q
+        (K_I, K_T) = K
+
+        # Check the dimension consistency of the combined matrices
+        assert Q_I.get_shape().as_list()[1:] == K_I.get_shape().as_list()[1:]
+        assert Q_T.get_shape().as_list()[1:] == K_T.get_shape().as_list()[1:]
+        assert Q_I.get_shape().as_list()[0] == Q_T.get_shape().as_list()[0]
+        assert K_I.get_shape().as_list()[0] == K_T.get_shape().as_list()[0]
+
+        # Build the attention maps
+        AM_Identity = tf.matmul(Q_I, tf.transpose(K_I, [0, 2, 1])) / tf.sqrt(tf.cast(Iunits, tf.float32))
+        AM_Time = tf.matmul(Q_T, tf.transpose(K_T, [0, 2, 1])) / tf.sqrt(tf.cast(Tunits, tf.float32))
+        if mask is not None:
+            # A large negative constant is used instead of -inf, which would produce NaN from -inf*0
+            AM_Time = tf.multiply(AM_Time, mask) + tf.constant(-1.0e9)*(tf.constant(1.0)-mask)
+        if self.mask_imputation is not None:
+            (iden_mask, time_mask) = self.mask_imputation
+            #AM_Identity = tf.multiply(AM_Identity,iden_mask) + tf.constant(-1.0e9)*(tf.constant(1.0)-iden_mask)
+            AM_Time = tf.multiply(AM_Time, time_mask) + tf.constant(-1.0e9)*(tf.constant(1.0)-time_mask)
+        AM_Identity = tf.nn.softmax(AM_Identity, 2)
+        AM_Time = tf.nn.softmax(AM_Time, 2)
+
+        shape_id = [segs, ids, time, value_units]
+        shape_time = [segs, time, ids, value_units]
+
+        if model_structure == 'Sequence':
+            # Apply the identity attention map first, then the time attention map, on the same value tensor
+            Out_Id = tf.reshape(tf.matmul(AM_Identity, tf.reshape(V, [segs, ids, -1])), shape_id)
+            Out_Id = tf.transpose(Out_Id, perm=[0, 2, 1, 3])
+            Out_Id_Time = tf.reshape(tf.matmul(AM_Time, tf.reshape(Out_Id, [segs, time, -1])), shape_time)
+            return tf.transpose(Out_Id_Time, perm=[0, 2, 1, 3])
+        else:
+            V_id, V_time = V, tf.transpose(V, perm=[0, 2, 1, 3])
+            Out_Identity = tf.reshape(tf.matmul(AM_Identity, tf.reshape(V_id, [segs, ids, -1])), shape_id)
+            Out_Time = tf.reshape(tf.matmul(AM_Time, tf.reshape(V_time, [segs, time, -1])), shape_time)
+
+            Out_Time = tf.transpose(Out_Time, perm=[0, 2, 1, 3])
+            if model_structure == 'Element-wise-addition':
+                return tf.divide(tf.add(Out_Identity, Out_Time), tf.constant(2.0))
+            elif model_structure == 'Concatenation':
+                Attention_output = tf.concat([Out_Identity, Out_Time], 3)
+                return tf.layers.dense(Attention_output, value_units, use_bias=False)
+            else:
+                raise ValueError('Unavailable model_structure: %s' % model_structure)
+
+    def casual_mask(self):
+        '''
+        This function is only applied in the self-attention layer of the decoder.
+        A lower-triangular matrix indicates the available references for every position in each calculation.
+        Key idea: only previous positions are used to predict the future.
+        '''
+        batch_size, period = self.batch_size, self.period_dec
+        casual_unit = np.tril(np.ones((period, period)))
+        casual_tensor = tf.convert_to_tensor(casual_unit, dtype=tf.float32)
+        return tf.tile(tf.expand_dims(casual_tensor, 0), [batch_size, 1, 1])
+
+    def impute_mask(self):
+        batch_size = self.batch_size
+        iden_unit = 1.0-np.identity(self.num_identity)
+        time_unit = 1.0-np.identity(self.period_dec)
+        iden_tensor = tf.tile(tf.expand_dims(tf.convert_to_tensor(iden_unit, dtype=tf.float32), 0), [self.num_heads*batch_size, 1, 1])
+        time_tensor = tf.tile(tf.expand_dims(tf.convert_to_tensor(time_unit, dtype=tf.float32), 0), [self.num_heads*batch_size, 1, 1])
+        return (iden_tensor, time_tensor)
+
+    def initial_parameter(self):
+
+        config = self.config
+        # Parameter initialization for data assignment
+        self.batch_size = int(config.get('batch_size', 1))
+        self.period_enc = int(config.get('period_enc', 12))
+        self.period_dec = int(config.get('period_dec', 12))
+
+        # Parameter initialization for the model framework
+        self.num_heads = int(config.get('num_heads', 8))
+        self.num_enclayer = int(config.get('num_enclayer', 5))
+        self.num_declayer = int(config.get('num_declayer', 5))
+        self.model_structure = self.config.get('model_structure')
+
+        # Parameter initialization for attention (Q, K and V)
+        self.AM_timjump = int(config.get('time_stride_AM', 1))
+        self.V_timjump = int(config.get('time_stride_V', 1))
+        self.AM_timfuse = int(config.get('time_fuse_AM', 1))
+        self.V_timfuse = int(config.get('time_fuse_V', 1))
+        vunits, Iunits, Tunits = int(config.get('units_value', 6)), int(config.get('units_IDw', 14)), int(config.get('units_Timew', 6))
+        self.attention_unit = (vunits, Iunits, Tunits)
+
+        # Parameter initialization for filters (Enc-Dec, prediction)
+        self.filter_encdec = config.get('filter_encdec', 'dense')
+        self.conv_unit = int(config.get('units_conv', 4))
+        self.attdrop_rate = float(config.get('drop_rate_attention', 0.0))
+        self.ffndrop_rate = float(config.get('drop_rate_forward', 0.1))
+        self.filter_pred = config.get('filter_pred', 'dense')
+        self.pred_unit = int(config.get('units_pred', 8))
+
+        # Mask flags
+        self.flag_identity = config.get('flag_identity', False)
+        self.flag_time = config.get('flag_time', False)
+        self.flag_casuality = config.get('flag_casuality', False)
+        self.flag_imputation = config.get('flag_imputation', False)
+
+    def auxiliary_encode(self, shared_info):
+        # Concatenation is not applicable here since the attention across all three dimensions needs to be learned,
+        # and expanding each dimension would not make sense for our model.
+        # Concatenation along the feature dimension (expanded as 1) is equivalent to element-wise addition.
+        with tf.variable_scope('shared_feature'):
+            shared_encoder = tf.layers.dense(tf.expand_dims(shared_info, 0), self.period_enc,
+                                             use_bias=False, activation=None, name='encoder')
+            shared_encoder = tf.reshape(shared_encoder, [1, self.num_identity, self.period_enc, 1])
+            shared_encoder = tf.tile(shared_encoder, [self.batch_size, 1, 1, 1])
+            shared_decoder = tf.layers.dense(tf.expand_dims(shared_info, 0), self.period_dec,
+                                             use_bias=False, activation=None, name='decoder')
+            shared_decoder = tf.reshape(shared_decoder, [1, self.num_identity, self.period_dec, 1])
+            shared_decoder = tf.tile(shared_decoder, [self.batch_size, 1, 1, 1])
+
+        # Sinusoidal time encoding with a slowly varying phase
+        denom = tf.constant(1000.0)
+        phase_enc = tf.linspace(0.0, self.period_enc-1.0, self.period_enc)*tf.constant(math.pi/180.0)/denom
+        phase_dec = tf.linspace(0.0, self.period_dec-1.0, self.period_dec)*tf.constant(math.pi/180.0)/denom
+        sin_enc, sin_dec = tf.expand_dims(tf.sin(phase_enc), 0), tf.expand_dims(tf.sin(phase_dec), 0)
+
+        time_encoder = tf.expand_dims(tf.tile(tf.expand_dims(sin_enc, 0), [self.batch_size, self.num_identity, 1]), -1)
+        time_decoder = tf.expand_dims(tf.tile(tf.expand_dims(sin_dec, 0), [self.batch_size, self.num_identity, 1]), -1)
+        return (shared_encoder, shared_decoder, time_encoder, time_decoder)
\ No newline at end of file
diff --git a/models/cross-dimensional-attention/tensorflow/MultiDim_Analyzer_Model.pyc b/models/cross-dimensional-attention/tensorflow/MultiDim_Analyzer_Model.pyc
new file mode 100644
index 0000000..277bcaa
Binary files /dev/null and b/models/cross-dimensional-attention/tensorflow/MultiDim_Analyzer_Model.pyc differ
diff --git a/models/cross-dimensional-attention/tensorflow/__init__.py b/models/cross-dimensional-attention/tensorflow/__init__.py
new file mode 100644
index 0000000..9615bf0
--- /dev/null
+++ b/models/cross-dimensional-attention/tensorflow/__init__.py
@@ -0,0 +1,7 @@
+# uncompyle6 version 3.7.1
+# Python bytecode 2.7 (62211)
+# Decompiled from: Python 3.6.9 (default, Apr 18 2020, 01:56:04)
+# [GCC 8.4.0]
+# Embedded file name: /home/jiawei/Tensor_MultiDim_NYC/Model/__init__.py
+# Compiled at: 2019-04-29 08:21:25
+pass
\ No newline at end of file
diff --git a/models/cross-dimensional-attention/tensorflow/__init__.pyc b/models/cross-dimensional-attention/tensorflow/__init__.pyc
new file mode 100644
index 0000000..9d4580d
Binary files /dev/null and b/models/cross-dimensional-attention/tensorflow/__init__.pyc differ
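
Usage note: no driver script is included in this diff, so the following is a
hypothetical sketch of how the pieces above fit together. The entry-point shape,
the `dataset_name` format expected by `Lib.Data_Processing.Data_Division`, and the
GPU selection are assumptions, not part of the committed code.

```python
# Hypothetical driver (sketch, not part of this diff).
import os
import yaml
import tensorflow as tf

from Model.MultiDim_Analyzer_Handler import MDAnalyzer_Handler

with open('Config.yaml') as f:
    model_config = yaml.safe_load(f)

# Assumption: the 'GPU' key in Config.yaml selects the visible device.
os.environ['CUDA_VISIBLE_DEVICES'] = str(model_config.get('GPU', 0))

# Assumption: Data_Division resolves the data/mask files named in Config.yaml.
dataset_name = (model_config['data_name'], model_config['mask_name'])

with tf.Session() as sess:
    handler = MDAnalyzer_Handler(dataset_name, model_config, sess, is_training=True)
    handler.train()
```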