diff --git a/tensorflow/BERT/.gitignore b/tensorflow/BERT/.gitignore
new file mode 100644
index 0000000..ca0b8b8
--- /dev/null
+++ b/tensorflow/BERT/.gitignore
@@ -0,0 +1,3 @@
+/models/
+/data/
+/output/
diff --git a/tensorflow/BERT/Makefile b/tensorflow/BERT/Makefile
new file mode 100755
index 0000000..bf7af5d
--- /dev/null
+++ b/tensorflow/BERT/Makefile
@@ -0,0 +1,73 @@
+# BERT sample for TensorFlow
+
+# Delete a target whose recipe failed, so that a truncated python.manifest left
+# behind by the shell redirection is never treated as up to date.
+.DELETE_ON_ERROR:
+
+ARCH_LIBDIR ?= /lib/$(shell $(CC) -dumpmachine)
+SGX_SIGNER_KEY ?= ../../../Pal/src/host/Linux-SGX/signer/enclave-key.pem
+
+ifeq ($(DEBUG),1)
+GRAMINE_LOG_LEVEL = debug
+else
+GRAMINE_LOG_LEVEL = error
+endif
+
+.PHONY: all collateral
+all: python.manifest
+ifeq ($(SGX),1)
+all: python.manifest.sgx python.sig python.token
+endif
+
+# Download locations of the model, dataset and checkpoints used by this sample.
+BERT_DATASET = https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip
+SQUAD_DATASET = https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
+CHECKPOINTS = https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/bert_large_checkpoints.zip
+BERT_FP32_MODEL = https://storage.googleapis.com/intel-optimized-tensorflow/models/v2_4_0/fp32_bert_squad.pb
+
+# Fetch everything the sample needs. Each step is guarded by a test, so running
+# the target again only downloads or unpacks what is still missing.
+collateral:
+	command -v unzip >/dev/null 2>&1 || apt-get install -y unzip
+	test -d models || git clone https://github.com/IntelAI/models.git
+	mkdir -p data
+	test -f data/wwm_uncased_L-24_H-1024_A-16.zip || wget $(BERT_DATASET) -P data/
+	test -d data/wwm_uncased_L-24_H-1024_A-16 || unzip data/wwm_uncased_L-24_H-1024_A-16.zip -d data
+	test -f data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json || wget $(SQUAD_DATASET) -P data/wwm_uncased_L-24_H-1024_A-16
+	test -f data/bert_large_checkpoints.zip || wget $(CHECKPOINTS) -P data/
+	test -d data/bert_large_checkpoints || unzip data/bert_large_checkpoints.zip -d data
+	test -f data/fp32_bert_squad.pb || wget $(BERT_FP32_MODEL) -P data/
+
+# `collateral` is an order-only prerequisite: it still runs on every build, but
+# it does not force the manifest (and hence the SGX signature) to be regenerated
+# each time. Run `make clean` after changing PYTHONDISTPATH or DEBUG.
+python.manifest: python.manifest.template | collateral
+	@test -n "$(PYTHONDISTPATH)" || \
+		{ echo "PYTHONDISTPATH is not set, please specify the Python dist-packages directory!"; exit 1; }
+	gramine-manifest \
+		-Dlog_level=$(GRAMINE_LOG_LEVEL) \
+		-Darch_libdir=$(ARCH_LIBDIR) \
+		-Dentrypoint=$(realpath $(shell sh -c "command -v python3")) \
+		-Dpythondistpath=$(PYTHONDISTPATH) \
+		$< >$@
+
+python.manifest.sgx: python.manifest
+	@test -s $(SGX_SIGNER_KEY) || \
+		{ echo "SGX signer private key was not found, please specify SGX_SIGNER_KEY!"; exit 1; }
+	gramine-sgx-sign \
+		--key $(SGX_SIGNER_KEY) \
+		--manifest $< --output $@
+
+# No recipe: python.sig is expected to be written by gramine-sgx-sign above.
+python.sig: python.manifest.sgx
+
+python.token: python.sig
+	gramine-sgx-get-token --output $@ --sig $<
+
+.PHONY: clean
+clean:
+	$(RM) *.manifest *.manifest.sgx *.token *.sig
+
+.PHONY: distclean
+distclean: clean
+	$(RM) -r models/ data/
diff --git a/tensorflow/BERT/python.manifest.template b/tensorflow/BERT/python.manifest.template
new file mode 100755
index 0000000..017305b
--- /dev/null
+++ b/tensorflow/BERT/python.manifest.template
@@ -0,0 +1,74 @@
+libos.entrypoint = "{{ entrypoint }}"
+loader.entrypoint = "file:{{ gramine.libos }}"
+
+loader.log_level = "{{ log_level }}"
+
+# The insecure__* options take argv and the environment from the host and
+# disable ASLR; as the prefix says, they are not meant for production use.
+loader.insecure__use_cmdline_argv = true
+loader.insecure__use_host_env = true
+loader.insecure__disable_aslr = true
+
+loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}"
+
+loader.pal_internal_mem_size = "512M"
+
+fs.mount.lib.type = "chroot"
+fs.mount.lib.path = "/lib"
+fs.mount.lib.uri = "file:{{ gramine.runtimedir() }}"
+
+fs.mount.lib2.type = "chroot"
+fs.mount.lib2.path = "{{ arch_libdir }}"
+fs.mount.lib2.uri = "file:{{ arch_libdir }}"
+
+fs.mount.usr.type = "chroot"
+fs.mount.usr.path = "/usr"
+fs.mount.usr.uri = "file:/usr"
+
+fs.mount.pyhome.type = "chroot"
+fs.mount.pyhome.path = "{{ python.stdlib }}"
+fs.mount.pyhome.uri = "file:{{ python.stdlib }}"
+
+fs.mount.pydisthome.type = "chroot"
+fs.mount.pydisthome.path = "{{ python.distlib }}"
+fs.mount.pydisthome.uri = "file:{{ python.distlib }}"
+
+# Extra Python package directory, passed to make as PYTHONDISTPATH.
+fs.mount.pydistpath.type = "chroot"
+fs.mount.pydistpath.path = "{{ pythondistpath }}"
+fs.mount.pydistpath.uri = "file:{{ pythondistpath }}"
+
+fs.mount.tmp.type = "chroot"
+fs.mount.tmp.path = "/tmp"
+fs.mount.tmp.uri = "file:/tmp"
+
+fs.mount.etc.type = "chroot"
+fs.mount.etc.path = "/etc"
+fs.mount.etc.uri = "file:/etc"
+
+sgx.enclave_size = "32G"
+sgx.thread_num = 256
+sgx.preheat_enclave = true
+sgx.nonpie_binary = true
+
+# Trusted files are hashed when the manifest is signed and verified when opened.
+sgx.trusted_files = [
+  "file:{{ gramine.runtimedir() }}/",
+  "file:{{ arch_libdir }}/",
+  "file:/usr/{{ arch_libdir }}/",
+  "file:{{ gramine.libos }}",
+  "file:{{ entrypoint }}",
+  "file:{{ python.stdlib }}/",
+  "file:{{ python.distlib }}/",
+  "file:{{ pythondistpath }}/",
+]
+
+# Allowed files are not integrity-checked. Besides /tmp and /etc, this covers the
+# cloned scripts (models/), the downloaded data (data/) and the results (output/).
+sgx.allowed_files = [
+  "file:/tmp/",
+  "file:/etc/",
+  "file:output/",
+  "file:models/",
+  "file:data/",
+]
diff --git a/tensorflow/README.md b/tensorflow/README.md
new file mode 100755
index 0000000..78080f5
--- /dev/null
+++ b/tensorflow/README.md
@@ -0,0 +1,137 @@
+## Inference on TensorFlow BERT and ResNet50 models
+This directory contains steps and artifacts to run inference with TensorFlow BERT and ResNet50
+sample workloads on Gramine. Specifically, both these examples use pre-trained models to run
+inference.
+
+### Bidirectional Encoder Representations from Transformers (BERT):
+BERT is a method of pre-training language representations and then using that trained model for
+downstream NLP tasks like 'question answering'. BERT is an unsupervised, deeply bidirectional system
+for pre-training NLP.
+In this BERT sample, we use the **BERT-Large, Uncased (Whole Word Masking)** model and perform int8
+inference. More details about BERT can be found at https://github.com/google-research/bert.
+
+### Residual Network (ResNet):
+ResNet50 is a convolutional neural network that is 50 layers deep.
+In this ResNet50 (v1.5) sample, we use a pre-trained model and perform int8 inference.
+More details about ResNet50 can be found at https://github.com/IntelAI/models/tree/icx-launch-public/benchmarks/image_recognition/tensorflow/resnet50v1_5.
+
+## Pre-requisites
+- Upgrade pip/pip3.
+- Install TensorFlow using ``pip install intel-tensorflow-avx512==2.4.0``.
+
+## Build BERT or ResNet50 samples
+- To build the BERT sample, do ``cd BERT``. To build the ResNet50 sample, do ``cd ResNet50``.
+- To clean the sample, do ``make clean``.
+- To clean and remove downloaded models and datasets, do ``make distclean``.
+- To build the non-SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/``.
+- To build the SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/ SGX=1``.
+- Typically, ``path_to_python_dist_packages`` is ``/usr/local/lib/python3.6/dist-packages``, but it can
+change based on Python's installation directory.
+
+**WARNING:** Building the BERT sample downloads about 5GB of data.
+
+## Run inference on BERT model
+- To run int8 inference on ``gramine-sgx`` (SGX version):
+```
+OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 gramine-sgx \
+./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \
+--init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 \
+--vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
+--bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json \
+--predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json \
+--precision=int8 \
+--output_dir=output/bert-squad-output \
+--predict_batch_size=32 \
+--experimental_gelu=True \
+--optimized_softmax=True \
+--input_graph=data/asymmetric_per_channel_bert_int8.pb \
+--do_predict=True --mode=benchmark \
+--inter_op_parallelism_threads=1 \
+--intra_op_parallelism_threads=36
+```
+- To run int8 inference on ``gramine-direct`` (non-SGX version), replace ``gramine-sgx`` with
+``gramine-direct`` in the above command.
+- To run int8 inference natively (outside Gramine), replace ``gramine-sgx ./python`` with
+``python3`` in the above command.
+
+## Run inference on ResNet50 model
+- To run inference on ``gramine-sgx`` (SGX version):
+```
+OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 gramine-sgx \
+./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \
+--input-graph=resnet50v1_5_int8_pretrained_model.pb \
+--num-inter-threads=1 \
+--num-intra-threads=36 \
+--batch-size=32 \
+--warmup-steps=50 \
+--steps=500
+```
+- To run inference on ``gramine-direct`` (non-SGX version), replace ``gramine-sgx`` with
+``gramine-direct`` in the above command.
+- To run inference natively (outside Gramine), replace ``gramine-sgx ./python`` with
+``python3`` in the above command.
+
+## Notes on optimal performance
+The above commands are for a 36-core system. Please set the following options accordingly for optimal
+performance:
+
+- Assuming that X is the number of cores per socket, set `OMP_NUM_THREADS=X`,
+  `intra_op_parallelism_threads=X` for BERT and `num-intra-threads=X` for ResNet50.
+- Specify the whole range of cores available on one of the sockets in `taskset`.
+- If hyperthreading is enabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0``
+- If hyperthreading is disabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact``
+- Note that `OMP_NUM_THREADS` sets the maximum number of threads to
+  use for OpenMP parallel regions, and `KMP_AFFINITY` binds OpenMP threads
+  to physical processing units.
+- The options `batch-size`, `warmup-steps` and `steps` can be varied for the ResNet50 sample.
+- To get the number of cores per socket, do ``lscpu | grep 'Core(s) per socket'``.
+
+## Performance considerations
+### CPU frequency scaling
+
+Linux systems have a CPU frequency scaling governor that helps the system to scale the CPU frequency
+to achieve the best performance or to save power, based on the requirement. To achieve the best
+performance, please set the CPU frequency scaling governor to `performance` mode.
+
+```bash
+for ((i=0; i<$(nproc); i++)); do
+    echo 'performance' > /sys/devices/system/cpu/cpu$i/cpufreq/scaling_governor;
+done
+```
+
+### Manifest options for performance
+
+- The preheat manifest option pre-faults the enclave memory and moves the performance penalty to
+Gramine-SGX startup (before the workload starts executing). To use the preheat option, make sure
+that `sgx.preheat_enclave = true` is added to the manifest template.
+
+### Memory allocator libraries
+
+TCMalloc and mimalloc are memory allocator libraries from Google and Microsoft, respectively, that
+can help improve performance significantly, depending on the workload. Only one of these
+allocators can be used at a time.
+
+#### TCMalloc
+
+(Please update the binary location and name if different from default.)
+- Install tcmalloc: `sudo apt-get install google-perftools`
+- Modify the manifest template file:
+    - Add `loader.env.LD_PRELOAD = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"`
+    - Append the entries below to `sgx.trusted_files`:
+        - `"file:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"`
+        - `"file:/usr/lib/x86_64-linux-gnu/libunwind.so.8"`
+- Save the manifest template and rebuild this example.
+
+#### mimalloc
+
+(Please update the binary location and name if different from default.)
+- Install mimalloc using the steps from https://github.com/microsoft/mimalloc
+- Modify the manifest template file:
+    - Add the `/usr/local` FS mount point:
+        - `fs.mount.usr_local.type = "chroot"`
+        - `fs.mount.usr_local.path = "/usr/local"`
+        - `fs.mount.usr_local.uri = "file:/usr/local"`
+    - Add `loader.env.LD_PRELOAD = "/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"`
+    - Append the entry below to `sgx.trusted_files`:
+        - `"file:/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"`
+- Save the manifest template and rebuild this example.
diff --git a/tensorflow/ResNet50/.gitignore b/tensorflow/ResNet50/.gitignore
new file mode 100644
index 0000000..e3e7bc8
--- /dev/null
+++ b/tensorflow/ResNet50/.gitignore
@@ -0,0 +1,2 @@
+/models/
+/resnet50v1_5_int8_pretrained_model.pb
diff --git a/tensorflow/ResNet50/Makefile b/tensorflow/ResNet50/Makefile
new file mode 100755
index 0000000..6ce2252
--- /dev/null
+++ b/tensorflow/ResNet50/Makefile
@@ -0,0 +1,61 @@
+# ResNet50 sample for TensorFlow
+
+# Delete a target whose recipe failed, so that a truncated python.manifest left
+# behind by the shell redirection is never treated as up to date.
+.DELETE_ON_ERROR:
+
+ARCH_LIBDIR ?= /lib/$(shell $(CC) -dumpmachine)
+SGX_SIGNER_KEY ?= ../../../Pal/src/host/Linux-SGX/signer/enclave-key.pem
+
+ifeq ($(DEBUG),1)
+GRAMINE_LOG_LEVEL = debug
+else
+GRAMINE_LOG_LEVEL = error
+endif
+
+.PHONY: all collateral
+all: python.manifest
+ifeq ($(SGX),1)
+all: python.manifest.sgx python.sig python.token
+endif
+
+# Fetch the IntelAI model scripts and the pre-trained int8 graph. Both steps are
+# guarded by a test, so running the target again downloads only what is missing.
+collateral:
+	test -d models || git clone https://github.com/IntelAI/models.git
+	test -f resnet50v1_5_int8_pretrained_model.pb || wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/resnet50v1_5_int8_pretrained_model.pb
+
+# `collateral` is an order-only prerequisite: it still runs on every build, but
+# it does not force the manifest (and hence the SGX signature) to be regenerated
+# each time. Run `make clean` after changing PYTHONDISTPATH or DEBUG.
+python.manifest: python.manifest.template | collateral
+	@test -n "$(PYTHONDISTPATH)" || \
+		{ echo "PYTHONDISTPATH is not set, please specify the Python dist-packages directory!"; exit 1; }
+	gramine-manifest \
+		-Dlog_level=$(GRAMINE_LOG_LEVEL) \
+		-Darch_libdir=$(ARCH_LIBDIR) \
+		-Dentrypoint=$(realpath $(shell sh -c "command -v python3")) \
+		-Dpythondistpath=$(PYTHONDISTPATH) \
+		$< >$@
+
+python.manifest.sgx: python.manifest
+	@test -s $(SGX_SIGNER_KEY) || \
+		{ echo "SGX signer private key was not found, please specify SGX_SIGNER_KEY!"; exit 1; }
+	gramine-sgx-sign \
+		--key $(SGX_SIGNER_KEY) \
+		--manifest $< \
+		--output $@
+
+# No recipe: python.sig is expected to be written by gramine-sgx-sign above.
+python.sig: python.manifest.sgx
+
+python.token: python.sig
+	gramine-sgx-get-token --output $@ --sig $<
+
+.PHONY: clean
+clean:
+	$(RM) *.manifest *.manifest.sgx *.token *.sig
+
+.PHONY: distclean
+distclean: clean
+	$(RM) -r models/ resnet50v1_5_int8_pretrained_model.pb
diff --git a/tensorflow/ResNet50/python.manifest.template b/tensorflow/ResNet50/python.manifest.template
new file mode 100755
index 0000000..6aef10d
--- /dev/null
+++ b/tensorflow/ResNet50/python.manifest.template
@@ -0,0 +1,77 @@
+loader.entrypoint = "file:{{ gramine.libos }}"
+libos.entrypoint = "{{ entrypoint }}"
+
+loader.log_level = "{{ log_level }}"
+
+# The insecure__* options take argv and the environment from the host and
+# disable ASLR; as the prefix says, they are not meant for production use.
+loader.insecure__use_cmdline_argv = true
+loader.insecure__use_host_env = true
+loader.insecure__disable_aslr = true
+
+loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}"
+
+loader.pal_internal_mem_size = "512M"
+
+fs.mount.lib.type = "chroot"
+fs.mount.lib.path = "/lib"
+fs.mount.lib.uri = "file:{{ gramine.runtimedir() }}"
+
+fs.mount.lib2.type = "chroot"
+fs.mount.lib2.path = "{{ arch_libdir }}"
+fs.mount.lib2.uri = "file:{{ arch_libdir }}"
+
+fs.mount.usr.type = "chroot"
+fs.mount.usr.path = "/usr"
+fs.mount.usr.uri = "file:/usr"
+
+fs.mount.bin.type = "chroot"
+fs.mount.bin.path = "/bin"
+fs.mount.bin.uri = "file:/bin"
+
+fs.mount.pyhome.type = "chroot"
+fs.mount.pyhome.path = "{{ python.stdlib }}"
+fs.mount.pyhome.uri = "file:{{ python.stdlib }}"
+
+fs.mount.pydisthome.type = "chroot"
+fs.mount.pydisthome.path = "{{ python.distlib }}"
+fs.mount.pydisthome.uri = "file:{{ python.distlib }}"
+
+# Extra Python package directory, passed to make as PYTHONDISTPATH.
+fs.mount.pydistpath.type = "chroot"
+fs.mount.pydistpath.path = "{{ pythondistpath }}"
+fs.mount.pydistpath.uri = "file:{{ pythondistpath }}"
+
+fs.mount.tmp.type = "chroot"
+fs.mount.tmp.path = "/tmp"
+fs.mount.tmp.uri = "file:/tmp"
+
+fs.mount.etc.type = "chroot"
+fs.mount.etc.path = "/etc"
+fs.mount.etc.uri = "file:/etc"
+
+sgx.enclave_size = "32G"
+sgx.thread_num = 300
+sgx.preheat_enclave = true
+sgx.nonpie_binary = true
+
+# Trusted files are hashed when the manifest is signed and verified when opened.
+sgx.trusted_files = [
+  "file:{{ gramine.runtimedir() }}/",
+  "file:{{ arch_libdir }}/",
+  "file:/usr/{{ arch_libdir }}/",
+  "file:resnet50v1_5_int8_pretrained_model.pb",
+  "file:{{ gramine.libos }}",
+  "file:{{ entrypoint }}",
+  "file:{{ python.stdlib }}/",
+  "file:{{ python.distlib }}/",
+  "file:{{ pythondistpath }}/",
+]
+
+# Allowed files are not integrity-checked. Besides /tmp and /etc, this covers
+# the scripts cloned into models/.
+sgx.allowed_files = [
+  "file:/tmp/",
+  "file:/etc/",
+  "file:models/",
+]