diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 00000000000..d78a3dc3455 --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,19 @@ +Please use the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users) for usage, installation, or modeling questions, or other requests for help. +_Do not post such requests to Issues._ Doing so interferes with the development of Caffe. + +Please read the [guidelines for contributing](https://github.com/BVLC/caffe/blob/master/CONTRIBUTING.md) before submitting this issue. + +### Issue summary + + +### Steps to reproduce + +If you are having difficulty building Caffe or training a model, please ask the caffe-users mailing list. If you are reporting a build error that seems to be due to a bug in Caffe, please attach your build configuration (either Makefile.config or CMakeCache.txt) and the output of the make (or cmake) command. + +### Your system configuration +Operating system: +Compiler: +CUDA version (if applicable): +CUDNN version (if applicable): +BLAS: +Python or MATLAB version (for pycaffe and matcaffe respectively): diff --git a/.gitignore b/.gitignore index 53c1fb056bb..eff292b7f61 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,9 @@ # PyCharm files .idea +# Visual Studio Code files +.vscode + # OSX dir files .DS_Store @@ -81,6 +84,7 @@ cmake_build # Generated documentation docs/_site +docs/_includes docs/gathered _site doxygen diff --git a/.travis.yml b/.travis.yml index 4dc7ed72d6c..3297954755d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,40 +1,52 @@ -# Use a build matrix to do two builds in parallel: -# one using CMake, and one using make. 
+dist: trusty +sudo: required + +language: cpp +compiler: gcc + env: + global: + - NUM_THREADS=4 matrix: - - WITH_CUDA=false WITH_CMAKE=false WITH_IO=true - - WITH_CUDA=false WITH_CMAKE=true WITH_IO=true PYTHON_VERSION=3 - - WITH_CUDA=true WITH_CMAKE=false WITH_IO=true - - WITH_CUDA=true WITH_CMAKE=true WITH_IO=true - - WITH_CUDA=false WITH_CMAKE=false WITH_IO=false - - WITH_CUDA=false WITH_CMAKE=true WITH_IO=false PYTHON_VERSION=3 + # Use a build matrix to test many builds in parallel + # envvar defaults: + # WITH_CMAKE: false + # WITH_PYTHON3: false + # WITH_IO: true + # WITH_CUDA: false + # WITH_CUDNN: false + - BUILD_NAME="default-make" +# - BUILD_NAME="python3-make" WITH_PYTHON3=true + - BUILD_NAME="no-io-make" WITH_IO=false + - BUILD_NAME="cuda-make" WITH_CUDA=true + - BUILD_NAME="cudnn-make" WITH_CUDA=true WITH_CUDNN=true -language: cpp + - BUILD_NAME="default-cmake" WITH_CMAKE=true + - BUILD_NAME="python3-cmake" WITH_CMAKE=true WITH_PYTHON3=true + - BUILD_NAME="no-io-cmake" WITH_CMAKE=true WITH_IO=false + - BUILD_NAME="cuda-cmake" WITH_CMAKE=true WITH_CUDA=true + - BUILD_NAME="cudnn-cmake" WITH_CMAKE=true WITH_CUDA=true WITH_CUDNN=true -# Cache Ubuntu apt packages. cache: apt: true directories: - - /home/travis/miniconda - - /home/travis/miniconda2 - - /home/travis/miniconda3 - -compiler: gcc + - ~/protobuf3 before_install: - - export NUM_THREADS=4 - - export SCRIPTS=./scripts/travis - - export CONDA_DIR="/home/travis/miniconda$PYTHON_VERSION" + - source ./scripts/travis/defaults.sh install: - - sudo -E $SCRIPTS/travis_install.sh + - sudo -E ./scripts/travis/install-deps.sh + - ./scripts/travis/setup-venv.sh ~/venv + - source ~/venv/bin/activate + - ./scripts/travis/install-python-deps.sh before_script: - - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib:/usr/local/cuda/lib64:$CONDA_DIR/lib - - export PATH=$CONDA_DIR/bin:$PATH - - if ! 
$WITH_CMAKE; then $SCRIPTS/travis_setup_makefile_config.sh; fi + - ./scripts/travis/configure.sh -script: $SCRIPTS/travis_build_and_test.sh +script: + - ./scripts/travis/build.sh + - ./scripts/travis/test.sh notifications: # Emails are sent to the committer's git-configured email address by default, diff --git a/CMakeLists.txt b/CMakeLists.txt index c5d99cef9dd..08f56a33a59 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,14 +10,15 @@ endif() project(Caffe C CXX) # ---[ Caffe version -set(CAFFE_TARGET_VERSION "1.0.0-rc3") -set(CAFFE_TARGET_SOVERSION "1.0.0-rc3") +set(CAFFE_TARGET_VERSION "1.0.0" CACHE STRING "Caffe logical version") +set(CAFFE_TARGET_SOVERSION "1.0.0" CACHE STRING "Caffe soname version") add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION}) # ---[ Using cmake scripts and modules list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) include(ExternalProject) +include(GNUInstallDirs) include(cmake/Utils.cmake) include(cmake/Targets.cmake) @@ -28,6 +29,7 @@ include(cmake/ConfigGen.cmake) # ---[ Options caffe_option(CPU_ONLY "Build Caffe without CUDA support" OFF) # TODO: rename to USE_CUDA caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ONLY) +caffe_option(USE_NCCL "Build Caffe with NCCL library support" OFF) caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON) caffe_option(BUILD_python "Build Python wrapper" ON) set(python_version "2" CACHE STRING "Specify which Python version to use") @@ -38,6 +40,7 @@ caffe_option(USE_OPENCV "Build with OpenCV support" ON) caffe_option(USE_LEVELDB "Build with levelDB" ON) caffe_option(USE_LMDB "Build with lmdb" ON) caffe_option(ALLOW_LMDB_NOLOCK "Allow MDB_NOLOCK when reading LMDB files (only if necessary)" OFF) +caffe_option(USE_OPENMP "Link with OpenMP (when your BLAS wants OpenMP and you get linker errors)" OFF) # ---[ Dependencies include(cmake/Dependencies.cmake) @@ -54,8 +57,6 @@ if(USE_libstdcpp) message("-- Warning: forcing libstdc++ (controlled by 
USE_libstdcpp option in cmake)") endif() -add_definitions(-DGTEST_USE_OWN_TR1_TUPLE) - # ---[ Warnings caffe_warnings_disable(CMAKE_CXX_FLAGS -Wno-sign-compare -Wno-uninitialized) @@ -64,8 +65,26 @@ configure_file(cmake/Templates/caffe_config.h.in "${PROJECT_BINARY_DIR}/caffe_co # ---[ Includes set(Caffe_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) -include_directories(${Caffe_INCLUDE_DIR} ${PROJECT_BINARY_DIR}) -include_directories(BEFORE src) # This is needed for gtest. +set(Caffe_SRC_DIR ${PROJECT_SOURCE_DIR}/src) +include_directories(${PROJECT_BINARY_DIR}) + +# ---[ Includes & defines for CUDA + +# cuda_compile() does not have per-call dependencies or include pathes +# (cuda_compile() has per-call flags, but we set them here too for clarity) +# +# list(REMOVE_ITEM ...) invocations remove PRIVATE and PUBLIC keywords from collected definitions and include pathes +if(HAVE_CUDA) + # pass include pathes to cuda_include_directories() + set(Caffe_ALL_INCLUDE_DIRS ${Caffe_INCLUDE_DIRS}) + list(REMOVE_ITEM Caffe_ALL_INCLUDE_DIRS PRIVATE PUBLIC) + cuda_include_directories(${Caffe_INCLUDE_DIR} ${Caffe_SRC_DIR} ${Caffe_ALL_INCLUDE_DIRS}) + + # add definitions to nvcc flags directly + set(Caffe_ALL_DEFINITIONS ${Caffe_DEFINITIONS}) + list(REMOVE_ITEM Caffe_ALL_DEFINITIONS PRIVATE PUBLIC) + list(APPEND CUDA_NVCC_FLAGS ${Caffe_ALL_DEFINITIONS}) +endif() # ---[ Subdirectories add_subdirectory(src/gtest) @@ -85,8 +104,19 @@ if(BUILD_python) add_dependencies(pytest pycaffe) endif() +# ---[ uninstall target +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Uninstall.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/cmake/Uninstall.cmake + IMMEDIATE @ONLY) + +add_custom_target(uninstall + COMMAND ${CMAKE_COMMAND} -P + ${CMAKE_CURRENT_BINARY_DIR}/cmake/Uninstall.cmake) + # ---[ Configuration summary caffe_print_configuration_summary() # ---[ Export configs generation caffe_generate_export_configs() + diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 8db66ea82c6..3fd767812e9 100644 --- 
a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -1,6 +1,6 @@ # Contributors -Caffe is developed by a core set of BVLC members and the open-source community. +Caffe is developed by a core set of BAIR members and the open-source community. We thank all of our [contributors](https://github.com/BVLC/caffe/graphs/contributors)! diff --git a/LICENSE b/LICENSE index d69d16f5bc7..0c99adc182c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,11 +1,11 @@ COPYRIGHT All contributions by the University of California: -Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +Copyright (c) 2014-2017 The Regents of the University of California (Regents) All rights reserved. All other contributions: -Copyright (c) 2014, 2015, the respective contributors +Copyright (c) 2014-2017, the respective contributors All rights reserved. Caffe uses a shared copyright model: each contributor holds copyright over diff --git a/Makefile b/Makefile index 2f81aca84e7..4d324160c08 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a DYNAMIC_VERSION_MAJOR := 1 DYNAMIC_VERSION_MINOR := 0 -DYNAMIC_VERSION_REVISION := 0-rc3 +DYNAMIC_VERSION_REVISION := 0 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so #DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR) DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION) @@ -192,12 +192,12 @@ ifeq ($(USE_LMDB), 1) LIBRARIES += lmdb endif ifeq ($(USE_OPENCV), 1) - LIBRARIES += opencv_core opencv_highgui opencv_imgproc + LIBRARIES += opencv_core opencv_highgui opencv_imgproc ifeq ($(OPENCV_VERSION), 3) LIBRARIES += opencv_imgcodecs endif - + endif PYTHON_LIBRARIES ?= boost_python python2.7 WARNINGS := -Wall -Wno-sign-compare @@ -272,7 +272,7 @@ endif ifeq ($(OSX), 1) CXX := /usr/bin/clang++ ifneq ($(CPU_ONLY), 1) - CUDA_VERSION := $(shell $(CUDA_DIR)/bin/nvcc -V | grep -o 'release 
\d' | grep -o '\d') + CUDA_VERSION := $(shell $(CUDA_DIR)/bin/nvcc -V | grep -o 'release [0-9.]*' | tr -d '[a-z ]') ifeq ($(shell echo | awk '{exit $(CUDA_VERSION) < 7.0;}'), 1) CXXFLAGS += -stdlib=libstdc++ LINKFLAGS += -stdlib=libstdc++ @@ -328,6 +328,12 @@ ifeq ($(USE_CUDNN), 1) COMMON_FLAGS += -DUSE_CUDNN endif +# NCCL acceleration configuration +ifeq ($(USE_NCCL), 1) + LIBRARIES += nccl + COMMON_FLAGS += -DUSE_NCCL +endif + # configure IO libraries ifeq ($(USE_OPENCV), 1) COMMON_FLAGS += -DUSE_OPENCV @@ -364,9 +370,9 @@ ifeq ($(BLAS), mkl) # MKL LIBRARIES += mkl_rt COMMON_FLAGS += -DUSE_MKL - MKL_DIR ?= /opt/intel/mkl - BLAS_INCLUDE ?= $(MKL_DIR)/include - BLAS_LIB ?= $(MKL_DIR)/lib $(MKL_DIR)/lib/intel64 + MKLROOT ?= /opt/intel/mkl + BLAS_INCLUDE ?= $(MKLROOT)/include + BLAS_LIB ?= $(MKLROOT)/lib $(MKLROOT)/lib/intel64 else ifeq ($(BLAS), open) # OpenBLAS LIBRARIES += openblas @@ -382,8 +388,11 @@ else LIBRARIES += cblas # 10.10 has accelerate while 10.9 has veclib XCODE_CLT_VER := $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables | grep 'version' | sed 's/[^0-9]*\([0-9]\).*/\1/') + XCODE_CLT_GEQ_7 := $(shell [ $(XCODE_CLT_VER) -gt 6 ] && echo 1) XCODE_CLT_GEQ_6 := $(shell [ $(XCODE_CLT_VER) -gt 5 ] && echo 1) - ifeq ($(XCODE_CLT_GEQ_6), 1) + ifeq ($(XCODE_CLT_GEQ_7), 1) + BLAS_INCLUDE ?= /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/$(shell ls /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/ | sort | tail -1)/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers + else ifeq ($(XCODE_CLT_GEQ_6), 1) BLAS_INCLUDE ?= /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ LDFLAGS += -framework Accelerate else diff --git a/Makefile.config.example b/Makefile.config.example index 8fd49c9c1a7..d552b38a97c 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -31,13 +31,17 
@@ CUDA_DIR := /usr/local/cuda # CUDA_DIR := /usr # CUDA architecture setting: going with all of them. -# For CUDA < 6.0, comment the *_50 lines for compatibility. +# For CUDA < 6.0, comment the *_50 through *_61 lines for compatibility. +# For CUDA < 8.0, comment the *_60 and *_61 lines for compatibility. CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ -gencode arch=compute_20,code=sm_21 \ -gencode arch=compute_30,code=sm_30 \ -gencode arch=compute_35,code=sm_35 \ -gencode arch=compute_50,code=sm_50 \ - -gencode arch=compute_50,code=compute_50 + -gencode arch=compute_52,code=sm_52 \ + -gencode arch=compute_60,code=sm_60 \ + -gencode arch=compute_61,code=sm_61 \ + -gencode arch=compute_61,code=compute_61 # BLAS choice: # atlas for ATLAS (default) @@ -68,7 +72,7 @@ PYTHON_INCLUDE := /usr/include/python2.7 \ # ANACONDA_HOME := $(HOME)/anaconda # PYTHON_INCLUDE := $(ANACONDA_HOME)/include \ # $(ANACONDA_HOME)/include/python2.7 \ - # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ + # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include # Uncomment to use Python 3 (default is Python 2) # PYTHON_LIBRARIES := boost_python3 python3.5m @@ -94,10 +98,15 @@ LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib # INCLUDE_DIRS += $(shell brew --prefix)/include # LIBRARY_DIRS += $(shell brew --prefix)/lib +# NCCL acceleration switch (uncomment to build with NCCL) +# https://github.com/NVIDIA/nccl (last tested version: v1.2.3-1+cuda8.0) +# USE_NCCL := 1 + # Uncomment to use `pkg-config` to specify OpenCV library paths. # (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.) # USE_PKG_CONFIG := 1 +# N.B. 
both build and distribute dirs are cleared on `make clean` BUILD_DIR := build DISTRIBUTE_DIR := distribute diff --git a/README.md b/README.md index 44b9e62c157..fe259535865 100644 --- a/README.md +++ b/README.md @@ -4,17 +4,25 @@ [![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE) Caffe is a deep learning framework made with expression, speed, and modularity in mind. -It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and community contributors. +It is developed by Berkeley AI Research ([BAIR](http://bair.berkeley.edu))/The Berkeley Vision and Learning Center (BVLC) and community contributors. Check out the [project site](http://caffe.berkeleyvision.org) for all the details like - [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p) - [Tutorial Documentation](http://caffe.berkeleyvision.org/tutorial/) -- [BVLC reference models](http://caffe.berkeleyvision.org/model_zoo.html) and the [community model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo) +- [BAIR reference models](http://caffe.berkeleyvision.org/model_zoo.html) and the [community model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo) - [Installation instructions](http://caffe.berkeleyvision.org/installation.html) and step-by-step examples. +## Custom distributions + + - [Intel Caffe](https://github.com/BVLC/caffe/tree/intel) (Optimized for CPU and support for multi-node), in particular Xeon processors (HSW, BDW, SKX, Xeon Phi). +- [OpenCL Caffe](https://github.com/BVLC/caffe/tree/opencl) e.g. for AMD or Intel devices. 
+- [Windows Caffe](https://github.com/BVLC/caffe/tree/windows) + +## Community + [![Join the chat at https://gitter.im/BVLC/caffe](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/BVLC/caffe?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) Please join the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users) or [gitter chat](https://gitter.im/BVLC/caffe) to ask questions and talk about methods and models. @@ -25,7 +33,7 @@ Happy brewing! ## License and Citation Caffe is released under the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE). -The BVLC reference models are released for unrestricted use. +The BAIR/BVLC reference models are released for unrestricted use. Please cite Caffe in your publications if it helps your research: diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake index 056371110b5..09bb09b4ff2 100644 --- a/cmake/ConfigGen.cmake +++ b/cmake/ConfigGen.cmake @@ -1,31 +1,4 @@ -################################################################################################ -# Helper function to fetch caffe includes which will be passed to dependent projects -# Usage: -# caffe_get_current_includes() -function(caffe_get_current_includes includes_variable) - get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES) - caffe_convert_absolute_paths(current_includes) - - # remove at most one ${PROJECT_BINARY_DIR} include added for caffe_config.h - list(FIND current_includes ${PROJECT_BINARY_DIR} __index) - list(REMOVE_AT current_includes ${__index}) - - # removing numpy includes (since not required for client libs) - set(__toremove "") - foreach(__i ${current_includes}) - if(${__i} MATCHES "python") - list(APPEND __toremove ${__i}) - endif() - endforeach() - if(__toremove) - list(REMOVE_ITEM current_includes ${__toremove}) - endif() - - caffe_list_unique(current_includes) - set(${includes_variable} ${current_includes} PARENT_SCOPE) -endfunction() - 
################################################################################################ # Helper function to get all list items that begin with given prefix # Usage: @@ -47,60 +20,24 @@ endfunction() function(caffe_generate_export_configs) set(install_cmake_suffix "share/Caffe") - # ---[ Configure build-tree CaffeConfig.cmake file ]--- - caffe_get_current_includes(Caffe_INCLUDE_DIRS) - - set(Caffe_DEFINITIONS "") if(NOT HAVE_CUDA) set(HAVE_CUDA FALSE) - list(APPEND Caffe_DEFINITIONS -DCPU_ONLY) - endif() - - if(USE_OPENCV) - list(APPEND Caffe_DEFINITIONS -DUSE_OPENCV) - endif() - - if(USE_LMDB) - list(APPEND Caffe_DEFINITIONS -DUSE_LMDB) - if (ALLOW_LMDB_NOLOCK) - list(APPEND Caffe_DEFINITIONS -DALLOW_LMDB_NOLOCK) - endif() - endif() - - if(USE_LEVELDB) - list(APPEND Caffe_DEFINITIONS -DUSE_LEVELDB) endif() if(NOT HAVE_CUDNN) set(HAVE_CUDNN FALSE) - else() - list(APPEND DEFINITIONS -DUSE_CUDNN) endif() - if(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl") - list(APPEND Caffe_DEFINITIONS -DUSE_MKL) - endif() + # ---[ Configure build-tree CaffeConfig.cmake file ]--- configure_file("cmake/Templates/CaffeConfig.cmake.in" "${PROJECT_BINARY_DIR}/CaffeConfig.cmake" @ONLY) # Add targets to the build-tree export set - export(TARGETS caffe proto FILE "${PROJECT_BINARY_DIR}/CaffeTargets.cmake") + export(TARGETS caffe caffeproto FILE "${PROJECT_BINARY_DIR}/CaffeTargets.cmake") export(PACKAGE Caffe) # ---[ Configure install-tree CaffeConfig.cmake file ]--- - # remove source and build dir includes - caffe_get_items_with_prefix(${PROJECT_SOURCE_DIR} Caffe_INCLUDE_DIRS __insource) - caffe_get_items_with_prefix(${PROJECT_BINARY_DIR} Caffe_INCLUDE_DIRS __inbinary) - list(REMOVE_ITEM Caffe_INCLUDE_DIRS ${__insource} ${__inbinary}) - - # add `install` include folder - set(lines - "get_filename_component(__caffe_include \"\${Caffe_CMAKE_DIR}/../../include\" ABSOLUTE)\n" - "list(APPEND Caffe_INCLUDE_DIRS \${__caffe_include})\n" - "unset(__caffe_include)\n") - string(REPLACE ";" "" 
Caffe_INSTALL_INCLUDE_DIR_APPEND_COMMAND ${lines}) - configure_file("cmake/Templates/CaffeConfig.cmake.in" "${PROJECT_BINARY_DIR}/cmake/CaffeConfig.cmake" @ONLY) # Install the CaffeConfig.cmake and export set to use with install-tree @@ -109,7 +46,7 @@ function(caffe_generate_export_configs) # ---[ Configure and install version file ]--- - # TODO: Lines below are commented because Caffe does't declare its version in headers. + # TODO: Lines below are commented because Caffe doesn't declare its version in headers. # When the declarations are added, modify `caffe_extract_caffe_version()` macro and uncomment # configure_file(cmake/Templates/CaffeConfigVersion.cmake.in "${PROJECT_BINARY_DIR}/CaffeConfigVersion.cmake" @ONLY) diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 286a42802b4..b2b19e8b669 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -4,7 +4,7 @@ endif() # Known NVIDIA GPU achitectures Caffe can be compiled for. # This list will be used for CUDA_ARCH_NAME = All option -set(Caffe_known_gpu_archs "20 21(20) 30 35 50") +set(Caffe_known_gpu_archs "20 21(20) 30 35 50 60 61") ################################################################################################ # A function for automatic detection of GPUs installed (if autodetection is enabled) @@ -56,7 +56,7 @@ endfunction() # caffe_select_nvcc_arch_flags(out_variable) function(caffe_select_nvcc_arch_flags out_variable) # List of arch names - set(__archs_names "Fermi" "Kepler" "Maxwell" "All" "Manual") + set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "All" "Manual") set(__archs_name_default "All") if(NOT CMAKE_CROSSCOMPILING) list(APPEND __archs_names "Auto") @@ -89,6 +89,8 @@ function(caffe_select_nvcc_arch_flags out_variable) set(__cuda_arch_bin "30 35") elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") set(__cuda_arch_bin "50") + elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") + set(__cuda_arch_bin "60 61") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(__cuda_arch_bin 
${Caffe_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") @@ -174,11 +176,18 @@ function(detect_cuDNN) PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDA_TOOLKIT_INCLUDE} DOC "Path to cuDNN include directory." ) - get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) - find_library(CUDNN_LIBRARY NAMES libcudnn.so # libcudnn_static.a - PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDNN_INCLUDE} ${__libpath_hist} - DOC "Path to cuDNN library.") + # dynamic libs have different suffix in mac and linux + if(APPLE) + set(CUDNN_LIB_NAME "libcudnn.dylib") + else() + set(CUDNN_LIB_NAME "libcudnn.so") + endif() + get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) + find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} + PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDNN_INCLUDE} ${__libpath_hist} ${__libpath_hist}/../lib + DOC "Path to cuDNN library.") + if(CUDNN_INCLUDE AND CUDNN_LIBRARY) set(HAVE_CUDNN TRUE PARENT_SCOPE) set(CUDNN_FOUND TRUE PARENT_SCOPE) @@ -231,17 +240,17 @@ endif() set(HAVE_CUDA TRUE) message(STATUS "CUDA detected: " ${CUDA_VERSION}) -include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) -list(APPEND Caffe_LINKER_LIBS ${CUDA_CUDART_LIBRARY} - ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) +list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${CUDA_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS PUBLIC ${CUDA_CUDART_LIBRARY} + ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) # cudnn detection if(USE_CUDNN) detect_cuDNN() if(HAVE_CUDNN) - add_definitions(-DUSE_CUDNN) - include_directories(SYSTEM ${CUDNN_INCLUDE}) - list(APPEND Caffe_LINKER_LIBS ${CUDNN_LIBRARY}) + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_CUDNN) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${CUDNN_INCLUDE}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${CUDNN_LIBRARY}) endif() endif() @@ -275,7 +284,7 @@ mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) if(APPLE) caffe_detect_darwin_version(OSX_VERSION) - # OSX 10.9 and higher uses clang/libc++ by default which is incompartible with old 
CUDA toolkits + # OSX 10.9 and higher uses clang/libc++ by default which is incompatible with old CUDA toolkits if(OSX_VERSION VERSION_GREATER 10.8) # enabled by default if and only if CUDA version is less than 7.0 caffe_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0)) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index c7b6a17aa69..c48255c89f2 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,57 +1,76 @@ -# This list is required for static linking and exported to CaffeConfig.cmake +# These lists are later turned into target properties on main caffe library target set(Caffe_LINKER_LIBS "") +set(Caffe_INCLUDE_DIRS "") +set(Caffe_DEFINITIONS "") +set(Caffe_COMPILE_OPTIONS "") # ---[ Boost -find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem) -include_directories(SYSTEM ${Boost_INCLUDE_DIR}) -list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES}) +find_package(Boost 1.54 REQUIRED COMPONENTS system thread filesystem) +list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${Boost_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS PUBLIC ${Boost_LIBRARIES}) # ---[ Threads find_package(Threads REQUIRED) -list(APPEND Caffe_LINKER_LIBS ${CMAKE_THREAD_LIBS_INIT}) +list(APPEND Caffe_LINKER_LIBS PRIVATE ${CMAKE_THREAD_LIBS_INIT}) + +# ---[ OpenMP +if(USE_OPENMP) + # Ideally, this should be provided by the BLAS library IMPORTED target. However, + # nobody does this, so we need to link to OpenMP explicitly and have the maintainer + # to flick the switch manually as needed. + # + # Moreover, OpenMP package does not provide an IMPORTED target as well, and the + # suggested way of linking to OpenMP is to append to CMAKE_{C,CXX}_FLAGS. + # However, this naïve method will force any user of Caffe to add the same kludge + # into their buildsystem again, so we put these options into per-target PUBLIC + # compile options and link flags, so that they will be exported properly. 
+ find_package(OpenMP REQUIRED) + list(APPEND Caffe_LINKER_LIBS PRIVATE ${OpenMP_CXX_FLAGS}) + list(APPEND Caffe_COMPILE_OPTIONS PRIVATE ${OpenMP_CXX_FLAGS}) +endif() # ---[ Google-glog include("cmake/External/glog.cmake") -include_directories(SYSTEM ${GLOG_INCLUDE_DIRS}) -list(APPEND Caffe_LINKER_LIBS ${GLOG_LIBRARIES}) +list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${GLOG_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS PUBLIC ${GLOG_LIBRARIES}) # ---[ Google-gflags include("cmake/External/gflags.cmake") -include_directories(SYSTEM ${GFLAGS_INCLUDE_DIRS}) -list(APPEND Caffe_LINKER_LIBS ${GFLAGS_LIBRARIES}) +list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${GFLAGS_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS PUBLIC ${GFLAGS_LIBRARIES}) # ---[ Google-protobuf include(cmake/ProtoBuf.cmake) # ---[ HDF5 find_package(HDF5 COMPONENTS HL REQUIRED) -include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR}) -list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES}) +list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${HDF5_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS PUBLIC ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES}) # ---[ LMDB if(USE_LMDB) find_package(LMDB REQUIRED) - include_directories(SYSTEM ${LMDB_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${LMDB_LIBRARIES}) - add_definitions(-DUSE_LMDB) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${LMDB_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${LMDB_LIBRARIES}) + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_LMDB) if(ALLOW_LMDB_NOLOCK) - add_definitions(-DALLOW_LMDB_NOLOCK) + list(APPEND Caffe_DEFINITIONS PRIVATE -DALLOW_LMDB_NOLOCK) endif() endif() # ---[ LevelDB if(USE_LEVELDB) find_package(LevelDB REQUIRED) - include_directories(SYSTEM ${LevelDB_INCLUDE}) - list(APPEND Caffe_LINKER_LIBS ${LevelDB_LIBRARIES}) - add_definitions(-DUSE_LEVELDB) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${LevelDB_INCLUDES}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${LevelDB_LIBRARIES}) + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_LEVELDB) endif() # ---[ Snappy if(USE_LEVELDB) 
find_package(Snappy REQUIRED) - include_directories(SYSTEM ${Snappy_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES}) + list(APPEND Caffe_INCLUDE_DIRS PRIVATE ${Snappy_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS PRIVATE ${Snappy_LIBRARIES}) endif() # ---[ CUDA @@ -63,8 +82,14 @@ if(NOT HAVE_CUDA) message(WARNING "-- CUDA is not detected by cmake. Building without it...") endif() - # TODO: remove this not cross platform define in future. Use caffe_config.h instead. - add_definitions(-DCPU_ONLY) + list(APPEND Caffe_DEFINITIONS PUBLIC -DCPU_ONLY) +endif() + +if(USE_NCCL) + find_package(NCCL REQUIRED) + include_directories(SYSTEM ${NCCL_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS ${NCCL_LIBRARIES}) + add_definitions(-DUSE_NCCL) endif() # ---[ OpenCV @@ -73,10 +98,10 @@ if(USE_OPENCV) if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc) endif() - include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS}) - list(APPEND Caffe_LINKER_LIBS ${OpenCV_LIBS}) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${OpenCV_INCLUDE_DIRS}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${OpenCV_LIBS}) message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})") - add_definitions(-DUSE_OPENCV) + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_OPENCV) endif() # ---[ BLAS @@ -86,22 +111,28 @@ if(NOT APPLE) if(BLAS STREQUAL "Atlas" OR BLAS STREQUAL "atlas") find_package(Atlas REQUIRED) - include_directories(SYSTEM ${Atlas_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${Atlas_LIBRARIES}) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${Atlas_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${Atlas_LIBRARIES}) elseif(BLAS STREQUAL "Open" OR BLAS STREQUAL "open") find_package(OpenBLAS REQUIRED) - include_directories(SYSTEM ${OpenBLAS_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${OpenBLAS_LIB}) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${OpenBLAS_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${OpenBLAS_LIB}) elseif(BLAS 
STREQUAL "MKL" OR BLAS STREQUAL "mkl") find_package(MKL REQUIRED) - include_directories(SYSTEM ${MKL_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${MKL_LIBRARIES}) - add_definitions(-DUSE_MKL) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${MKL_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${MKL_LIBRARIES}) + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_MKL) endif() elseif(APPLE) find_package(vecLib REQUIRED) - include_directories(SYSTEM ${vecLib_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${vecLib_LINKER_LIBS}) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${vecLib_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${vecLib_LINKER_LIBS}) + + if(VECLIB_FOUND) + if(NOT vecLib_INCLUDE_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*") + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_ACCELERATE) + endif() + endif() endif() # ---[ Python @@ -113,18 +144,18 @@ if(BUILD_python) find_package(NumPy 1.7.1) # Find the matching boost python implementation set(version ${PYTHONLIBS_VERSION_STRING}) - + STRING( REGEX REPLACE "[^0-9]" "" boost_py_version ${version} ) find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}") set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND}) - + while(NOT "${version}" STREQUAL "" AND NOT Boost_PYTHON_FOUND) STRING( REGEX REPLACE "([0-9.]+).[0-9]+" "\\1" version ${version} ) - + STRING( REGEX REPLACE "[^0-9]" "" boost_py_version ${version} ) find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}") set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND}) - + STRING( REGEX MATCHALL "([0-9.]+).[0-9]+" has_more_version ${version} ) if("${has_more_version}" STREQUAL "") break() @@ -143,9 +174,9 @@ if(BUILD_python) if(PYTHONLIBS_FOUND AND NUMPY_FOUND AND Boost_PYTHON_FOUND) set(HAVE_PYTHON TRUE) if(BUILD_python_layer) - add_definitions(-DWITH_PYTHON_LAYER) - include_directories(SYSTEM ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) - list(APPEND Caffe_LINKER_LIBS ${PYTHON_LIBRARIES} 
${Boost_LIBRARIES}) + list(APPEND Caffe_DEFINITIONS PRIVATE -DWITH_PYTHON_LAYER) + list(APPEND Caffe_INCLUDE_DIRS PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} PUBLIC ${Boost_INCLUDE_DIRS}) + list(APPEND Caffe_LINKER_LIBS PRIVATE ${PYTHON_LIBRARIES} PUBLIC ${Boost_LIBRARIES}) endif() endif() endif() diff --git a/cmake/External/glog.cmake b/cmake/External/glog.cmake index a44672f2753..f9d0549cd90 100644 --- a/cmake/External/glog.cmake +++ b/cmake/External/glog.cmake @@ -37,6 +37,7 @@ if (NOT __GLOG_INCLUDED) GIT_TAG "v0.3.4" UPDATE_COMMAND "" INSTALL_DIR ${gflags_INSTALL} + PATCH_COMMAND autoreconf -i ${glog_PREFIX}/src/glog CONFIGURE_COMMAND env "CFLAGS=${GLOG_C_FLAGS}" "CXXFLAGS=${GLOG_CXX_FLAGS}" ${glog_PREFIX}/src/glog/configure --prefix=${glog_INSTALL} --enable-shared=no --enable-static=yes --with-gflags=${GFLAGS_LIBRARY_DIRS}/.. LOG_DOWNLOAD 1 LOG_CONFIGURE 1 diff --git a/cmake/Modules/FindAtlas.cmake b/cmake/Modules/FindAtlas.cmake index 6e1564351c7..7ffa6393bbc 100644 --- a/cmake/Modules/FindAtlas.cmake +++ b/cmake/Modules/FindAtlas.cmake @@ -26,9 +26,9 @@ set(Atlas_LIB_SEARCH_PATHS find_path(Atlas_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Atlas_INCLUDE_SEARCH_PATHS}) find_path(Atlas_CLAPACK_INCLUDE_DIR NAMES clapack.h PATHS ${Atlas_INCLUDE_SEARCH_PATHS}) -find_library(Atlas_CBLAS_LIBRARY NAMES ptcblas_r ptcblas cblas_r cblas PATHS ${Atlas_LIB_SEARCH_PATHS}) -find_library(Atlas_BLAS_LIBRARY NAMES atlas_r atlas PATHS ${Atlas_LIB_SEARCH_PATHS}) -find_library(Atlas_LAPACK_LIBRARY NAMES alapack_r alapack lapack_atlas PATHS ${Atlas_LIB_SEARCH_PATHS}) +find_library(Atlas_CBLAS_LIBRARY NAMES ptcblas_r ptcblas cblas_r cblas PATHS ${Atlas_LIB_SEARCH_PATHS}) +find_library(Atlas_BLAS_LIBRARY NAMES atlas_r atlas PATHS ${Atlas_LIB_SEARCH_PATHS}) +find_library(Atlas_LAPACK_LIBRARY NAMES lapack alapack_r alapack lapack_atlas atllapack PATHS ${Atlas_LIB_SEARCH_PATHS}) set(LOOKED_FOR Atlas_CBLAS_INCLUDE_DIR @@ -47,6 +47,6 @@ if(ATLAS_FOUND) set(Atlas_LIBRARIES 
${Atlas_LAPACK_LIBRARY} ${Atlas_CBLAS_LIBRARY} ${Atlas_BLAS_LIBRARY}) mark_as_advanced(${LOOKED_FOR}) - message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR}, library: ${Atlas_BLAS_LIBRARY})") + message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR} library: ${Atlas_BLAS_LIBRARY} lapack: ${Atlas_LAPACK_LIBRARY})") endif(ATLAS_FOUND) diff --git a/cmake/Modules/FindNCCL.cmake b/cmake/Modules/FindNCCL.cmake new file mode 100644 index 00000000000..c8845934102 --- /dev/null +++ b/cmake/Modules/FindNCCL.cmake @@ -0,0 +1,26 @@ +set(NCCL_INC_PATHS + /usr/include + /usr/local/include + $ENV{NCCL_DIR}/include + ) + +set(NCCL_LIB_PATHS + /lib + /lib64 + /usr/lib + /usr/lib64 + /usr/local/lib + /usr/local/lib64 + $ENV{NCCL_DIR}/lib + ) + +find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_INC_PATHS}) +find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_LIB_PATHS}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARIES) + +if (NCCL_FOUND) + message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIR}, library: ${NCCL_LIBRARIES})") + mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES) +endif () diff --git a/cmake/Modules/FindvecLib.cmake b/cmake/Modules/FindvecLib.cmake index 9600da43647..4d44e613a00 100644 --- a/cmake/Modules/FindvecLib.cmake +++ b/cmake/Modules/FindvecLib.cmake @@ -12,11 +12,13 @@ endif() set(__veclib_include_suffix "Frameworks/vecLib.framework/Versions/Current/Headers") +exec_program(xcode-select ARGS -print-path OUTPUT_VARIABLE CMAKE_XCODE_DEVELOPER_DIR) find_path(vecLib_INCLUDE_DIR vecLib.h DOC "vecLib include directory" - PATHS /System/Library/${__veclib_include_suffix} - /System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} - /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/) + PATHS 
/System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} + /System/Library/${__veclib_include_suffix} + ${CMAKE_XCODE_DEVELOPER_DIR}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ + NO_DEFAULT_PATH) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(vecLib DEFAULT_MSG vecLib_INCLUDE_DIR) diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake index 73f647f5fae..8005b448707 100644 --- a/cmake/ProtoBuf.cmake +++ b/cmake/ProtoBuf.cmake @@ -2,8 +2,8 @@ # the standard cmake script with version and python generation support find_package( Protobuf REQUIRED ) -include_directories(SYSTEM ${PROTOBUF_INCLUDE_DIR}) -list(APPEND Caffe_LINKER_LIBS ${PROTOBUF_LIBRARIES}) +list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${PROTOBUF_INCLUDE_DIR}) +list(APPEND Caffe_LINKER_LIBS PUBLIC ${PROTOBUF_LIBRARIES}) # As of Ubuntu 14.04 protoc is no longer a part of libprotobuf-dev package # and should be installed separately as in: sudo apt-get install protobuf-compiler diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index ba025cf81e0..ed8c25268db 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -117,6 +117,7 @@ function(caffe_print_configuration_summary) caffe_status(" USE_OPENCV : ${USE_OPENCV}") caffe_status(" USE_LEVELDB : ${USE_LEVELDB}") caffe_status(" USE_LMDB : ${USE_LMDB}") + caffe_status(" USE_NCCL : ${USE_NCCL}") caffe_status(" ALLOW_LMDB_NOLOCK : ${ALLOW_LMDB_NOLOCK}") caffe_status("") caffe_status("Dependencies:") diff --git a/cmake/Targets.cmake b/cmake/Targets.cmake index a796d00548f..090f86c5500 100644 --- a/cmake/Targets.cmake +++ b/cmake/Targets.cmake @@ -88,13 +88,13 @@ function(caffe_pickup_caffe_sources root) file(GLOB_RECURSE proto_files ${root}/src/caffe/*.proto) list(APPEND srcs ${proto_files}) - # convet to absolute paths + # convert to absolute paths caffe_convert_absolute_paths(srcs) 
caffe_convert_absolute_paths(cuda) caffe_convert_absolute_paths(test_srcs) caffe_convert_absolute_paths(test_cuda) - # propogate to parent scope + # propagate to parent scope set(srcs ${srcs} PARENT_SCOPE) set(cuda ${cuda} PARENT_SCOPE) set(test_srcs ${test_srcs} PARENT_SCOPE) @@ -102,7 +102,7 @@ function(caffe_pickup_caffe_sources root) endfunction() ################################################################################################ -# Short command for setting defeault target properties +# Short command for setting default target properties # Usage: # caffe_default_properties() function(caffe_default_properties target) @@ -111,7 +111,7 @@ function(caffe_default_properties target) ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib" LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib" RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin") - # make sure we build all external depepdencies first + # make sure we build all external dependencies first if (DEFINED external_project_dependencies) add_dependencies(${target} ${external_project_dependencies}) endif() diff --git a/cmake/Templates/CaffeConfig.cmake.in b/cmake/Templates/CaffeConfig.cmake.in index 73f57ac2d74..77c4059e560 100644 --- a/cmake/Templates/CaffeConfig.cmake.in +++ b/cmake/Templates/CaffeConfig.cmake.in @@ -9,9 +9,9 @@ # After successful configuration the following variables # will be defined: # -# Caffe_INCLUDE_DIRS - Caffe include directories -# Caffe_LIBRARIES - libraries to link against -# Caffe_DEFINITIONS - a list of definitions to pass to compiler +# Caffe_LIBRARIES - IMPORTED targets to link against +# (There is no Caffe_INCLUDE_DIRS and Caffe_DEFINITIONS +# because they are specified in the IMPORTED target interface.) 
# # Caffe_HAVE_CUDA - signals about CUDA support # Caffe_HAVE_CUDNN - signals about cuDNN support @@ -27,7 +27,7 @@ if(@USE_OPENCV@) if(EXISTS ${Caffe_OpenCV_CONFIG_PATH} AND NOT TARGET opencv_core) message(STATUS "Caffe: using OpenCV config from ${Caffe_OpenCV_CONFIG_PATH}") - include(${Caffe_OpenCV_CONFIG_PATH}/OpenCVModules.cmake) + include(${Caffe_OpenCV_CONFIG_PATH}/OpenCVConfig.cmake) endif() else() @@ -39,9 +39,6 @@ endif() # Compute paths get_filename_component(Caffe_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) -set(Caffe_INCLUDE_DIRS "@Caffe_INCLUDE_DIRS@") - -@Caffe_INSTALL_INCLUDE_DIR_APPEND_COMMAND@ # Our library dependencies if(NOT TARGET caffe AND NOT caffe_BINARY_DIR) @@ -49,11 +46,9 @@ if(NOT TARGET caffe AND NOT caffe_BINARY_DIR) endif() # List of IMPORTED libs created by CaffeTargets.cmake +# These targets already specify all needed definitions and include paths set(Caffe_LIBRARIES caffe) -# Definitions -set(Caffe_DEFINITIONS "@Caffe_DEFINITIONS@") - # Cuda support variables set(Caffe_CPU_ONLY @CPU_ONLY@) set(Caffe_HAVE_CUDA @HAVE_CUDA@) diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in index 8a31b43cabf..2080c63df36 100644 --- a/cmake/Templates/caffe_config.h.in +++ b/cmake/Templates/caffe_config.h.in @@ -4,35 +4,9 @@ /* Binaries directory */ #define BINARY_FOLDER "${PROJECT_BINARY_DIR}" -/* NVIDA Cuda */ -#cmakedefine HAVE_CUDA - -/* NVIDA cuDNN */ -#cmakedefine HAVE_CUDNN -#cmakedefine USE_CUDNN - -/* NVIDA cuDNN */ -#cmakedefine CPU_ONLY +/* This is an absolute path so that we can run test from any build + * directory */ +#define ABS_TEST_DATA_DIR "${PROJECT_SOURCE_DIR}/src/caffe/test/test_data/" /* Test device */ #define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE} -/* Temporary (TODO: remove) */ -#if 1 - #define CMAKE_SOURCE_DIR SOURCE_FOLDER "/src/" - #define EXAMPLES_SOURCE_DIR BINARY_FOLDER "/examples/" - #define CMAKE_EXT ".gen.cmake" -#else - #define CMAKE_SOURCE_DIR "src/" - #define EXAMPLES_SOURCE_DIR 
"examples/" - #define CMAKE_EXT "" -#endif - -/* Matlab */ -#cmakedefine HAVE_MATLAB - -/* IO libraries */ -#cmakedefine USE_OPENCV -#cmakedefine USE_LEVELDB -#cmakedefine USE_LMDB -#cmakedefine ALLOW_LMDB_NOLOCK diff --git a/cmake/Uninstall.cmake.in b/cmake/Uninstall.cmake.in new file mode 100644 index 00000000000..bb8e2964e46 --- /dev/null +++ b/cmake/Uninstall.cmake.in @@ -0,0 +1,26 @@ +if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") + message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") +endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") + +if (NOT DEFINED CMAKE_INSTALL_PREFIX) + set (CMAKE_INSTALL_PREFIX "@CMAKE_INSTALL_PREFIX@") +endif () + message(${CMAKE_INSTALL_PREFIX}) + +file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) +string(REGEX REPLACE "\n" ";" files "${files}") +foreach(file ${files}) + message(STATUS "Uninstalling $ENV{DESTDIR}${file}") + if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") + exec_program( + "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\"" + OUTPUT_VARIABLE rm_out + RETURN_VALUE rm_retval + ) + if(NOT "${rm_retval}" STREQUAL 0) + message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") + endif(NOT "${rm_retval}" STREQUAL 0) + else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") + message(STATUS "File $ENV{DESTDIR}${file} does not exist.") + endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") +endforeach(file) \ No newline at end of file diff --git a/data/cifar10/get_cifar10.sh b/data/cifar10/get_cifar10.sh index 623c848513e..423f10989c4 100755 --- a/data/cifar10/get_cifar10.sh +++ b/data/cifar10/get_cifar10.sh @@ -2,7 +2,7 @@ # This scripts downloads the CIFAR10 (binary version) data and unzips it. DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR +cd "$DIR" echo "Downloading..." 
diff --git a/data/ilsvrc12/get_ilsvrc_aux.sh b/data/ilsvrc12/get_ilsvrc_aux.sh index 90935f25099..dc0d0a72790 100755 --- a/data/ilsvrc12/get_ilsvrc_aux.sh +++ b/data/ilsvrc12/get_ilsvrc_aux.sh @@ -8,7 +8,7 @@ # - the training splits with labels DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR +cd "$DIR" echo "Downloading..." diff --git a/data/mnist/get_mnist.sh b/data/mnist/get_mnist.sh index 6d875219489..ecadffa44f7 100755 --- a/data/mnist/get_mnist.sh +++ b/data/mnist/get_mnist.sh @@ -2,7 +2,7 @@ # This scripts downloads the mnist data and unzips it. DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR +cd "$DIR" echo "Downloading..." diff --git a/docker/Makefile b/docker/Makefile deleted file mode 100644 index 725208c6b2b..00000000000 --- a/docker/Makefile +++ /dev/null @@ -1,50 +0,0 @@ -# A makefile to build the docker images for caffe. -# Two caffe images will be built: -# caffe:cpu --> A CPU-only build of caffe. -# caffe:gpu --> A GPU-enabled build using the latest CUDA and CUDNN versions. - -DOCKER ?= docker - -all: docker_files standalone - -.PHONY: standalone devel - -standalone: cpu_standalone gpu_standalone - - -cpu_standalone: standalone/cpu/Dockerfile - $(DOCKER) build -t caffe:cpu standalone/cpu - -gpu_standalone: standalone/gpu/Dockerfile - $(DOCKER) build -t caffe:gpu standalone/gpu - -docker_files: standalone_files - -standalone_files: standalone/cpu/Dockerfile standalone/gpu/Dockerfile - -FROM_GPU = "nvidia/cuda:cudnn" -FROM_CPU = "ubuntu:14.04" -GPU_CMAKE_ARGS = -DUSE_CUDNN=1 -CPU_CMAKE_ARGS = -DCPU_ONLY=1 - -# A make macro to select the CPU or GPU base image. -define from_image -$(if $(strip $(findstring gpu,$@)),$(FROM_GPU),$(FROM_CPU)) -endef - -# A make macro to select the CPU or GPU build args. 
-define build_args -$(if $(strip $(findstring gpu,$@)),$(GPU_CMAKE_ARGS),$(CPU_CMAKE_ARGS)) -endef - -# A make macro to construct the CPU or GPU Dockerfile from the template -define create_docker_file - @echo creating $@ - @echo "FROM "$(from_image) > $@ - @cat $^ | sed 's/$${CMAKE_ARGS}/$(build_args)/' >> $@ -endef - - -standalone/%/Dockerfile: templates/Dockerfile.template - $(create_docker_file) - diff --git a/docker/README.md b/docker/README.md index fdab641bdca..f9c7c756fe6 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,52 +1,47 @@ -# Caffe standalone Dockerfiles. +### Running an official image -The `standalone` subfolder contains docker files for generating both CPU and GPU executable images for Caffe. The images can be built using make, or by running: +You can run one of the automatic [builds](https://hub.docker.com/r/bvlc/caffe). E.g. for the CPU version: -``` -docker build -t caffe:cpu standalone/cpu -``` -for example. (Here `gpu` can be substituted for `cpu`, but to keep the readme simple, only the `cpu` case will be discussed in detail). +`docker run -ti bvlc/caffe:cpu caffe --version` -Note that the GPU standalone requires a CUDA 7.5 capable driver to be installed on the system and [nvidia-docker] for running the Docker containers. Here it is generally sufficient to use `nvidia-docker` instead of `docker` in any of the commands mentioned. +or for GPU support (You need a CUDA 8.0 capable driver and +[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)): -# Running Caffe using the docker image +`nvidia-docker run -ti bvlc/caffe:gpu caffe --version` -In order to test the Caffe image, run: -``` -docker run -ti caffe:cpu caffe --version -``` -which should show a message like: -``` -libdc1394 error: Failed to initialize libdc1394 -caffe version 1.0.0-rc3 -``` +You might see an error about libdc1394, ignore it. 
-One can also build and run the Caffe tests in the image using: -``` -docker run -ti caffe:cpu bash -c "cd /opt/caffe/build; make runtest" -``` +### Docker run options -In order to get the most out of the caffe image, some more advanced `docker run` options could be used. For example, running: -``` -docker run -ti --volume=$(pwd):/workspace caffe:cpu caffe train --solver=example_solver.prototxt -``` -will train a network defined in the `example_solver.prototxt` file in the current directory (`$(pwd)` is maped to the container volume `/workspace` using the `--volume=` Docker flag). +By default caffe runs as root, thus any output files, e.g. snapshots, will be owned +by root. It also runs by default in a container-private folder. -Note that docker runs all commands as root by default, and thus any output files (e.g. snapshots) generated will be owned by the root user. In order to ensure that the current user is used instead, the following command can be used: -``` -docker run -ti --volume=$(pwd):/workspace -u $(id -u):$(id -g) caffe:cpu caffe train --solver=example_solver.prototxt -``` -where the `-u` Docker command line option runs the commands in the container as the specified user, and the shell command `id` is used to determine the user and group ID of the current user. Note that the Caffe docker images have `/workspace` defined as the default working directory. This can be overridden using the `--workdir=` Docker command line option. +You can change this using flags, like user (-u), current directory, and volumes (-w and -v). +E.g. this behaves like the usual caffe executable: -# Other use-cases +`docker run --rm -u $(id -u):$(id -g) -v $(pwd):$(pwd) -w $(pwd) bvlc/caffe:cpu caffe train --solver=example_solver.prototxt` -Although running the `caffe` command in the docker containers as described above serves many purposes, the container can also be used for more interactive use cases. 
For example, specifying `bash` as the command instead of `caffe` yields a shell that can be used for interactive tasks. (Since the caffe build requirements are included in the container, this can also be used to build and run local versions of caffe). +Containers can also be used interactively, specifying e.g. `bash` or `ipython` +instead of `caffe`. -Another use case is to run python scripts that depend on `caffe`'s Python modules. Using the `python` command instead of `bash` or `caffe` will allow this, and an interactive interpreter can be started by running: ``` -docker run -ti caffe:cpu python +docker run -ti bvlc/caffe:cpu ipython +import caffe +... ``` -(`ipython` is also available in the container). -Since the `caffe/python` folder is also added to the path, the utility executable scripts defined there can also be used as executables. This includes `draw_net.py`, `classify.py`, and `detect.py` +The caffe build requirements are included in the container, so this can be used to +build and run custom versions of caffe. Also, `caffe/python` is in PATH, so python +utilities can be used directly, e.g. `draw_net.py`, `classify.py`, or `detect.py`. 
+ +### Building images yourself + +Examples: + +`docker build -t caffe:cpu cpu` + +`docker build -t caffe:gpu gpu` + +You can also build Caffe and run the tests in the image: +`docker run -ti caffe:cpu bash -c "cd /opt/caffe/build; make runtest"` diff --git a/docker/standalone/cpu/Dockerfile b/docker/cpu/Dockerfile similarity index 73% rename from docker/standalone/cpu/Dockerfile rename to docker/cpu/Dockerfile index 4fef25aa6a1..67e2e61bd57 100644 --- a/docker/standalone/cpu/Dockerfile +++ b/docker/cpu/Dockerfile @@ -1,5 +1,5 @@ -FROM ubuntu:14.04 -MAINTAINER caffe-maint@googlegroups.com +FROM ubuntu:16.04 +LABEL maintainer caffe-maint@googlegroups.com RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -20,17 +20,20 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python-dev \ python-numpy \ python-pip \ + python-setuptools \ python-scipy && \ rm -rf /var/lib/apt/lists/* ENV CAFFE_ROOT=/opt/caffe WORKDIR $CAFFE_ROOT -# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. -ENV CLONE_TAG=master +# FIXME: use ARG instead of ENV once DockerHub supports this +# https://github.com/docker/hub-feedback/issues/460 +ENV CLONE_TAG=1.0 RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \ - for req in $(cat python/requirements.txt) pydot; do pip install $req; done && \ + pip install --upgrade pip && \ + cd python && for req in $(cat requirements.txt) pydot; do pip install $req; done && cd .. && \ mkdir build && cd build && \ cmake -DCPU_ONLY=1 .. 
&& \ make -j"$(nproc)" diff --git a/docker/standalone/gpu/Dockerfile b/docker/gpu/Dockerfile similarity index 64% rename from docker/standalone/gpu/Dockerfile rename to docker/gpu/Dockerfile index 1ddc6560d16..dcdbdf326fb 100644 --- a/docker/standalone/gpu/Dockerfile +++ b/docker/gpu/Dockerfile @@ -1,5 +1,5 @@ -FROM nvidia/cuda:cudnn -MAINTAINER caffe-maint@googlegroups.com +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 +LABEL maintainer caffe-maint@googlegroups.com RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -20,19 +20,23 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python-dev \ python-numpy \ python-pip \ + python-setuptools \ python-scipy && \ rm -rf /var/lib/apt/lists/* ENV CAFFE_ROOT=/opt/caffe WORKDIR $CAFFE_ROOT -# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. -ENV CLONE_TAG=master +# FIXME: use ARG instead of ENV once DockerHub supports this +# https://github.com/docker/hub-feedback/issues/460 +ENV CLONE_TAG=1.0 RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \ - for req in $(cat python/requirements.txt) pydot; do pip install $req; done && \ + pip install --upgrade pip && \ + cd python && for req in $(cat requirements.txt) pydot; do pip install $req; done && cd .. && \ + git clone https://github.com/NVIDIA/nccl.git && cd nccl && make -j install && cd .. && rm -rf nccl && \ mkdir build && cd build && \ - cmake -DUSE_CUDNN=1 .. && \ + cmake -DUSE_CUDNN=1 -DUSE_NCCL=1 .. 
&& \ make -j"$(nproc)" ENV PYCAFFE_ROOT $CAFFE_ROOT/python diff --git a/docker/templates/Dockerfile.template b/docker/templates/Dockerfile.template deleted file mode 100644 index 8834f057968..00000000000 --- a/docker/templates/Dockerfile.template +++ /dev/null @@ -1,42 +0,0 @@ -MAINTAINER caffe-maint@googlegroups.com - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - git \ - wget \ - libatlas-base-dev \ - libboost-all-dev \ - libgflags-dev \ - libgoogle-glog-dev \ - libhdf5-serial-dev \ - libleveldb-dev \ - liblmdb-dev \ - libopencv-dev \ - libprotobuf-dev \ - libsnappy-dev \ - protobuf-compiler \ - python-dev \ - python-numpy \ - python-pip \ - python-scipy && \ - rm -rf /var/lib/apt/lists/* - -ENV CAFFE_ROOT=/opt/caffe -WORKDIR $CAFFE_ROOT - -# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. -ENV CLONE_TAG=master - -RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \ - for req in $(cat python/requirements.txt) pydot; do pip install $req; done && \ - mkdir build && cd build && \ - cmake ${CMAKE_ARGS} .. && \ - make -j"$(nproc)" - -ENV PYCAFFE_ROOT $CAFFE_ROOT/python -ENV PYTHONPATH $PYCAFFE_ROOT:$PYTHONPATH -ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH -RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig - -WORKDIR /workspace diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html index b8efe60bc3b..3799e95afde 100644 --- a/docs/_layouts/default.html +++ b/docs/_layouts/default.html @@ -36,7 +36,7 @@

Caffe

- Deep learning framework by the BVLC + Deep learning framework by BAIR

Created by diff --git a/docs/development.md b/docs/development.md index 107c2c3b281..ec05bbee102 100644 --- a/docs/development.md +++ b/docs/development.md @@ -4,7 +4,7 @@ title: Developing and Contributing # Development and Contributing Caffe is developed with active participation of the community.
-The [BVLC](http://bvlc.eecs.berkeley.edu/) brewers welcome all contributions! +The [BAIR](http://bair.berkeley.edu/)/BVLC brewers welcome all contributions! The exact details of contributions are recorded by versioning and cited in our [acknowledgements](http://caffe.berkeleyvision.org/#acknowledgements). This method is impartial and always up-to-date. @@ -37,7 +37,7 @@ We absolutely appreciate any contribution to this effort! The `master` branch receives all new development including community contributions. We try to keep it in a reliable state, but it is the bleeding edge, and things do get broken every now and then. -BVLC maintainers will periodically make releases by marking stable checkpoints as tags and maintenance branches. [Past releases](https://github.com/BVLC/caffe/releases) are catalogued online. +BAIR maintainers will periodically make releases by marking stable checkpoints as tags and maintenance branches. [Past releases](https://github.com/BVLC/caffe/releases) are catalogued online. #### Issues & Pull Request Protocol diff --git a/docs/index.md b/docs/index.md index 932b3b58d1d..b633f7cfddc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,7 +5,7 @@ title: Deep Learning Framework # Caffe Caffe is a deep learning framework made with expression, speed, and modularity in mind. -It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and by community contributors. +It is developed by Berkeley AI Research ([BAIR](http://bair.berkeley.edu)) and by community contributors. [Yangqing Jia](http://daggerfs.com) created the project during his PhD at UC Berkeley. Caffe is released under the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE). @@ -23,21 +23,20 @@ Thanks to these contributors the framework tracks the state-of-the-art in both c **Speed** makes Caffe perfect for research experiments and industry deployment. 
Caffe can process **over 60M images per day** with a single NVIDIA K40 GPU\*. -That's 1 ms/image for inference and 4 ms/image for learning. -We believe that Caffe is the fastest convnet implementation available. +That's 1 ms/image for inference and 4 ms/image for learning and more recent library versions and hardware are faster still. +We believe that Caffe is among the fastest convnet implementations available. **Community**: Caffe already powers academic research projects, startup prototypes, and even large-scale industrial applications in vision, speech, and multimedia. Join our community of brewers on the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users) and [Github](https://github.com/BVLC/caffe/).

-\* With the ILSVRC2012-winning [SuperVision](http://www.image-net.org/challenges/LSVRC/2012/supervision.pdf) model and caching IO. -Consult performance [details](/performance_hardware.html). +\* With the ILSVRC2012-winning [SuperVision](http://www.image-net.org/challenges/LSVRC/2012/supervision.pdf) model and prefetching IO.

## Documentation -- [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p)
-Tutorial presentation. +- [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p) and [Caffe in a Day](https://docs.google.com/presentation/d/1HxGdeq8MPktHaPb-rlmYYQ723iWzq9ur6Gjo71YiG0Y/edit#slide=id.gc2fcdcce7_216_0)
+Tutorial presentation of the framework and a full-day crash course. - [Tutorial Documentation](/tutorial)
Practical guide and framework reference. - [arXiv / ACM MM '14 paper](http://arxiv.org/abs/1408.5093)
@@ -45,18 +44,13 @@ A 4-page report for the ACM Multimedia Open Source competition (arXiv:1408.5093v - [Installation instructions](/installation.html)
Tested on Ubuntu, Red Hat, OS X. * [Model Zoo](/model_zoo.html)
-BVLC suggests a standard distribution format for Caffe models, and provides trained models. +BAIR suggests a standard distribution format for Caffe models, and provides trained models. * [Developing & Contributing](/development.html)
Guidelines for development and contributing to Caffe. * [API Documentation](/doxygen/annotated.html)
Developer documentation automagically generated from code comments. - -### Examples - -{% assign examples = site.pages | where:'category','example' | sort: 'priority' %} -{% for page in examples %} --
{{page.title}}
{{page.description}}
-{% endfor %} +* [Benchmarking](https://docs.google.com/spreadsheets/d/1Yp4rqHpT7mKxOPbpzYeUfEFLnELDAgxSSBQKp5uKDGQ/edit#gid=0)
+Comparison of inference and learning for different networks and GPUs. ### Notebook Examples @@ -65,6 +59,13 @@ Developer documentation automagically generated from code comments. -
{{page.title}}
{{page.description}}
{% endfor %} +### Command Line Examples + +{% assign examples = site.pages | where:'category','example' | sort: 'priority' %} +{% for page in examples %} +-
{{page.title}}
{{page.description}}
+{% endfor %} + ## Citing Caffe Please cite Caffe in your publications if it helps your research: @@ -76,8 +77,7 @@ Please cite Caffe in your publications if it helps your research: Year = {2014} } -If you do publish a paper where Caffe helped your research, we encourage you to update the [publications wiki](https://github.com/BVLC/caffe/wiki/Publications). -Citations are also tracked automatically by [Google Scholar](http://scholar.google.com/scholar?oi=bibs&hl=en&cites=17333247995453974016). +If you do publish a paper where Caffe helped your research, we encourage you to cite the framework for tracking by [Google Scholar](https://scholar.google.com/citations?view_op=view_citation&hl=en&citation_for_view=-ltRSM0AAAAJ:u5HHmVD_uO8C). ## Contacting Us @@ -85,17 +85,12 @@ Join the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users Framework development discussions and thorough bug reports are collected on [Issues](https://github.com/BVLC/caffe/issues). -Contact [caffe-dev](mailto:caffe-dev@googlegroups.com) if you have a confidential proposal for the framework *and the ability to act on it*. -Requests for features, explanations, or personal help will be ignored; post to [caffe-users](https://groups.google.com/forum/#!forum/caffe-users) instead. - -The core Caffe developers offer [consulting services](mailto:caffe-coldpress@googlegroups.com) for appropriate projects. - ## Acknowledgements -The BVLC Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BVLC PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance. +The BAIR Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance. 
-The BVLC members who have contributed to Caffe are (alphabetical by first name): -[Eric Tzeng](https://github.com/erictzeng), [Evan Shelhamer](http://imaginarynumber.net/), [Jeff Donahue](http://jeffdonahue.com/), [Jon Long](https://github.com/longjon), [Ross Girshick](http://www.cs.berkeley.edu/~rbg/), [Sergey Karayev](http://sergeykarayev.com/), [Sergio Guadarrama](http://www.eecs.berkeley.edu/~sguada/), and [Yangqing Jia](http://daggerfs.com/). +The BAIR members who have contributed to Caffe are (alphabetical by first name): +[Carl Doersch](http://www.carldoersch.com/), [Eric Tzeng](https://github.com/erictzeng), [Evan Shelhamer](http://imaginarynumber.net/), [Jeff Donahue](http://jeffdonahue.com/), [Jon Long](https://github.com/longjon), [Philipp Krähenbühl](http://www.philkr.net/), [Ronghang Hu](http://ronghanghu.com/), [Ross Girshick](http://www.cs.berkeley.edu/~rbg/), [Sergey Karayev](http://sergeykarayev.com/), [Sergio Guadarrama](http://www.eecs.berkeley.edu/~sguada/), [Takuya Narihira](https://github.com/tnarihi), and [Yangqing Jia](http://daggerfs.com/). The open-source community plays an important and growing role in Caffe's development. Check out the Github [project pulse](https://github.com/BVLC/caffe/pulse) for recent activity and the [contributors](https://github.com/BVLC/caffe/graphs/contributors) for the full list. @@ -103,4 +98,4 @@ Check out the Github [project pulse](https://github.com/BVLC/caffe/pulse) for re We sincerely appreciate your interest and contributions! If you'd like to contribute, please read the [developing & contributing](development.html) guide. -Yangqing would like to give a personal thanks to the NVIDIA Academic program for providing GPUs, [Oriol Vinyals](http://www1.icsi.berkeley.edu/~vinyals/) for discussions along the journey, and BVLC PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for advice. 
+Yangqing would like to give a personal thanks to the NVIDIA Academic program for providing GPUs, [Oriol Vinyals](http://www1.icsi.berkeley.edu/~vinyals/) for discussions along the journey, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for advice. diff --git a/docs/install_apt.md b/docs/install_apt.md index 2976e3cd07c..b6cb1c2d6f7 100644 --- a/docs/install_apt.md +++ b/docs/install_apt.md @@ -1,22 +1,59 @@ --- -title: Installation: Ubuntu +title: "Installation: Ubuntu" --- # Ubuntu Installation +### For Ubuntu (>= 17.04) + +**Installing pre-compiled Caffe** + +Everything including caffe itself is packaged in 17.04 and higher versions. +To install pre-compiled Caffe package, just do it by + + sudo apt install caffe-cpu + +for CPU-only version, or + + sudo apt install caffe-cuda + +for CUDA version. Note, the cuda version may break if your NVIDIA driver +and CUDA toolkit are not installed by APT. + +[Package status of CPU-only version](https://launchpad.net/ubuntu/+source/caffe) + +[Package status of CUDA version](https://launchpad.net/ubuntu/+source/caffe-contrib) + +**Installing Caffe from source** + +We may install the dependencies by merely one line + + sudo apt build-dep caffe-cpu # dependencies for CPU-only version + sudo apt build-dep caffe-cuda # dependencies for CUDA version + +It requires a `deb-src` line in your `sources.list`. +Continue with [compilation](installation.html#compilation). + +### For Ubuntu (\< 17.04) + **General dependencies** sudo apt-get install libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler sudo apt-get install --no-install-recommends libboost-all-dev -**CUDA**: Install via the NVIDIA package instead of `apt-get` to be certain of the library and driver versions. -Install the library and latest driver separately; the driver bundled with the library is usually out-of-date. +**CUDA**: Install by `apt-get` or the NVIDIA `.run` package. 
+The NVIDIA package tends to follow more recent library and driver versions, but the installation is more manual. +If installing from packages, install the library and latest driver separately; the driver bundled with the library is usually out-of-date. This can be skipped for CPU-only installation. -**BLAS**: install ATLAS by `sudo apt-get install libatlas-base-dev` or install OpenBLAS or MKL for better CPU performance. +**BLAS**: install ATLAS by `sudo apt-get install libatlas-base-dev` or install OpenBLAS by `sudo apt-get install libopenblas-dev` or MKL for better CPU performance. **Python** (optional): if you use the default Python you will need to `sudo apt-get install` the `python-dev` package to have the Python headers for building the pycaffe interface. +**Compatibility notes, 16.04** + +CUDA 8 is required on Ubuntu 16.04. + **Remaining dependencies, 14.04** Everything is packaged in 14.04. @@ -28,8 +65,8 @@ Everything is packaged in 14.04. These dependencies need manual installation in 12.04. # glog - wget https://google-glog.googlecode.com/files/glog-0.3.3.tar.gz - tar zxvf glog-0.3.3.tar.gz + wget https://github.com/google/glog/archive/v0.3.3.tar.gz + tar zxvf v0.3.3.tar.gz cd glog-0.3.3 ./configure make && make install diff --git a/docs/install_apt_debian.md b/docs/install_apt_debian.md new file mode 100644 index 00000000000..0a6a3b962e5 --- /dev/null +++ b/docs/install_apt_debian.md @@ -0,0 +1,163 @@ +--- +title: "Installation: Debian" +--- + +# Debian Installation + +Caffe packages are available for several Debian versions, as shown in the +following chart: + +``` +Your Distro | CPU_ONLY | CUDA | Codename +----------------+------------+--------+------------------- +Debian/oldstable| ✘ | ✘ | Jessie (8.0) +Debian/stable | ✔ | ✔ | Stretch (9.0) +Debian/testing | ✔ | ✔ | Buster +Debian/unstable | ✔ | ✔ | Buster +``` + +* `✘ ` You should take a look at [Ubuntu installation instruction](install_apt.html). 
+
+* `✔ ` You can install caffe with a single command line following this guide.
+
+* [Package status of CPU-only version](https://tracker.debian.org/pkg/caffe)
+
+* [Package status of CUDA version](https://tracker.debian.org/pkg/caffe-contrib)
+
+Last update: 2017-07-08
+
+## Binary installation with APT
+
+Apart from the installation methods based on source, Debian users can install
+pre-compiled Caffe packages from the official archive with APT.
+
+Make sure that your `/etc/apt/sources.list` contains `contrib` and `non-free`
+sections if you want to install the CUDA version, for instance:
+
+```
+deb http://ftp2.cn.debian.org/debian sid main contrib non-free
+```
+
+Then we update APT cache and directly install Caffe. Note, the cpu version and
+the cuda version cannot coexist.
+
+```
+$ sudo apt update
+$ sudo apt install [ caffe-cpu | caffe-cuda ]
+$ caffe # command line interface working
+$ python3 -c 'import caffe; print(caffe.__path__)' # python3 interface working
+```
+
+These Caffe packages should work for you out of the box. However, the CUDA version
+may break if your NVIDIA driver and CUDA toolkit are not installed with APT.
+
+#### Customizing caffe packages
+
+Some users may need to customize the Caffe package. The way to customize
+the package is beyond the scope of this guide. Here is only a brief guide of producing
+the customized `.deb` packages.
+
+Make sure that there is a `deb-src` source in your `/etc/apt/sources.list`,
+for instance:
+
+```
+deb http://ftp2.cn.debian.org/debian sid main contrib non-free
+deb-src http://ftp2.cn.debian.org/debian sid main contrib non-free
+```
+
+Then we build caffe deb files with the following commands:
+
+```
+$ sudo apt update
+$ sudo apt install build-essential debhelper devscripts # standard package building tools
+$ sudo apt build-dep [ caffe-cpu | caffe-cuda ] # the most elegant way to pull caffe build dependencies
+$ apt source [ caffe-cpu | caffe-cuda ] # download the source tarball and extract
+$ cd caffe-XXXX
+[ ... 
optional, customizing caffe code/build ... ]
+$ dch --local "Modified XXX" # bump package version and write changelog
+$ debuild -B -j4 # build caffe with 4 parallel jobs (similar to make -j4)
+[ ... building ...]
+$ debc # optional, if you want to check the package contents
+$ sudo debi # optional, install the generated packages
+$ ls ../ # optional, you will see the resulting packages
+```
+
+It is a BUG if the package fails to build without any change.
+The changelog will be installed at e.g. `/usr/share/doc/caffe-cpu/changelog.Debian.gz`.
+
+## Source installation
+
+Source installation under Debian/unstable and Debian/testing is similar to that of Ubuntu, but
+here is a more elegant way to pull caffe build dependencies:
+
+```
+$ sudo apt build-dep [ caffe-cpu | caffe-cuda ]
+```
+
+Note, this requires a `deb-src` entry in your `/etc/apt/sources.list`.
+
+#### Compiler Combinations
+
+Some users may find their favorite compiler doesn't work with CUDA.
+
+```
+CXX compiler | CUDA 7.5 | CUDA 8.0 | CUDA 9.0 |
+-------------+------------+------------+------------+
+GCC-8 | ? | ? | ? |
+GCC-7 | ? | ? | ? |
+GCC-6 | ✘ | ✘ | ✔ |
+GCC-5 | ✔ [1] | ✔ | ✔ |
+-------------+------------+------------+------------+
+CLANG-4.0 | ? | ? | ? |
+CLANG-3.9 | ✘ | ✘ | ✔ |
+CLANG-3.8 | ? | ✔ | ✔ |
+```
+
+`[1]` CUDA 7.5's `host_config.h` must be patched before working with GCC-5.
+
+`[2]` CUDA 9.0: https://devblogs.nvidia.com/parallelforall/cuda-9-features-revealed/
+
+BTW, please forget the GCC-4.X series, since its `libstdc++` ABI is not compatible with GCC-5's.
+You may encounter failure linking GCC-4.X object files against GCC-5 libraries.
+(See https://wiki.debian.org/GCC5 )
+
+## Notes
+
+* Consider re-compiling OpenBLAS locally with optimization flags for the sake of
+performance. This is highly recommended for any kind of production use, including
+academic research. 
+
+* If you are installing `caffe-cuda`, APT will automatically pull some of the
+CUDA packages and the nvidia driver packages. Please be careful if you have
+manually installed or hacked nvidia driver or CUDA toolkit or any other
+related stuff, because in this case APT may fail.
+
+* Additionally, a manpage (`man caffe`) and a bash completion script
+(`caffe `, `caffe train `) are provided.
+Neither of these files has been merged into caffe master yet.
+
+* The python interface is Python 3 version: `python3-caffe-{cpu,cuda}`.
+No plan to support python2.
+
+* If you encounter any problem related to the packaging system (e.g. failed to install `caffe-*`),
+please report a bug to Debian via Debian's bug tracking system. See https://www.debian.org/Bugs/ .
+Patches and suggestions are also welcome.
+
+## FAQ
+
+* where is caffe-cudnn?
+
+The CUDNN library does not seem to be redistributable currently. If you really want the
+caffe-cudnn deb packages, the workaround is to install cudnn by yourself,
+and hack the packaging scripts, then build your customized package.
+
+* I installed the CPU version. How can I switch to the CUDA version?
+
+`sudo apt install caffe-cuda`, apt's dependency resolver is smart enough to deal with this.
+
+* Where are the examples, the models and other documentation stuff? 
+ +``` +$ sudo apt install caffe-doc +$ dpkg -L caffe-doc +``` diff --git a/docs/install_osx.md b/docs/install_osx.md index 6405d8ad046..a2da82f0fb2 100644 --- a/docs/install_osx.md +++ b/docs/install_osx.md @@ -1,5 +1,5 @@ --- -title: Installation: OS X +title: "Installation: OS X" --- # OS X Installation diff --git a/docs/install_yum.md b/docs/install_yum.md index 2104912e482..842fbd64177 100644 --- a/docs/install_yum.md +++ b/docs/install_yum.md @@ -1,5 +1,5 @@ --- -title: Installation: RHEL / Fedora / CentOS +title: "Installation: RHEL / Fedora / CentOS" --- # RHEL / Fedora / CentOS Installation @@ -15,7 +15,7 @@ title: Installation: RHEL / Fedora / CentOS **Remaining dependencies, if not found** # glog - wget https://google-glog.googlecode.com/files/glog-0.3.3.tar.gz + wget https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/google-glog/glog-0.3.3.tar.gz tar zxvf glog-0.3.3.tar.gz cd glog-0.3.3 ./configure diff --git a/docs/installation.md b/docs/installation.md index 893164584d9..42f1d0ce09b 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -5,13 +5,25 @@ title: Installation # Installation Prior to installing, have a glance through this guide and take note of the details for your platform. -We install and run Caffe on Ubuntu 14.04 and 12.04, OS X 10.10 / 10.9 / 10.8, and AWS. -The official Makefile and `Makefile.config` build are complemented by an automatic CMake build from the community. +We install and run Caffe on Ubuntu 16.04–12.04, OS X 10.11–10.8, and through Docker and AWS. +The official Makefile and `Makefile.config` build are complemented by a [community CMake build](#cmake-build). 
+ +**Step-by-step Instructions**: + +- [Docker setup](https://github.com/BVLC/caffe/tree/master/docker) *out-of-the-box brewing* +- [Ubuntu installation](install_apt.html) *the standard platform* +- [Debian installation](install_apt_debian.html) *install caffe with a single command* +- [OS X installation](install_osx.html) +- [RHEL / CentOS / Fedora installation](install_yum.html) +- [Windows](https://github.com/BVLC/caffe/tree/windows) *see the Windows branch led by Guillaume Dumont* +- [OpenCL](https://github.com/BVLC/caffe/tree/opencl) *see the OpenCL branch led by Fabian Tschopp* +- [AWS AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-caffe) *pre-configured for AWS* + +**Overview**: - [Prerequisites](#prerequisites) - [Compilation](#compilation) - [Hardware](#hardware) -- Platforms: [Ubuntu guide](install_apt.html), [OS X guide](install_osx.html), and [RHEL / CentOS / Fedora guide](install_yum.html) When updating Caffe, it's best to `make clean` before re-compiling. @@ -20,7 +32,7 @@ When updating Caffe, it's best to `make clean` before re-compiling. Caffe has several dependencies: * [CUDA](https://developer.nvidia.com/cuda-zone) is required for GPU mode. - * library version 7.0 and the latest driver version are recommended, but 6.* is fine too + * library version 7+ and the latest driver version are recommended, but 6.* is fine too * 5.5, and 5.0 are compatible but considered legacy * [BLAS](http://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) via ATLAS, MKL, or OpenBLAS. * [Boost](http://www.boost.org/) >= 1.55 @@ -30,14 +42,14 @@ Optional dependencies: * [OpenCV](http://opencv.org/) >= 2.4 including 3.0 * IO libraries: `lmdb`, `leveldb` (note: leveldb requires `snappy`) -* cuDNN for GPU acceleration (v3) +* cuDNN for GPU acceleration (v6) Pycaffe and Matcaffe interfaces have their own natural needs. 
* For Python Caffe: `Python 2.7` or `Python 3.3+`, `numpy (>= 1.7)`, boost-provided `boost.python` * For MATLAB Caffe: MATLAB with the `mex` compiler. -**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v3; older versions are supported in older Caffe. +**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v6; older versions are supported in older Caffe. **CPU-only Caffe**: for cold-brewed CPU-only Caffe uncomment the `CPU_ONLY := 1` flag in `Makefile.config` to configure and build Caffe without CUDA. This is helpful for cloud or cluster deployment. @@ -52,7 +64,7 @@ Caffe requires BLAS as the backend of its matrix and vector computations. There are several implementations of this library. The choice is yours: * [ATLAS](http://math-atlas.sourceforge.net/): free, open source, and so the default for Caffe. -* [Intel MKL](http://software.intel.com/en-us/intel-mkl): commercial and optimized for Intel CPUs, with a free trial and [student](http://software.intel.com/en-us/intel-education-offerings) licenses. +* [Intel MKL](http://software.intel.com/en-us/intel-mkl): commercial and optimized for Intel CPUs, with [free](https://registrationcenter.intel.com/en/forms/?productid=2558) licenses. 1. Install MKL. 2. Set up MKL environment (Details: [Linux](https://software.intel.com/en-us/node/528499), [OS X](https://software.intel.com/en-us/node/528659)). Example: *source /opt/intel/mkl/bin/mklvars.sh intel64* 3. 
Set `BLAS := mkl` in `Makefile.config` @@ -82,10 +94,6 @@ Install MATLAB, and make sure that its `mex` is in your `$PATH`. *Caffe's MATLAB interface works with versions 2015a, 2014a/b, 2013a/b, and 2012b.* -#### Windows - -There is an unofficial Windows port of Caffe at [niuzhiheng/caffe:windows](https://github.com/niuzhiheng/caffe). Thanks [@niuzhiheng](https://github.com/niuzhiheng)! - ## Compilation Caffe can be compiled with either Make or CMake. Make is officially supported while CMake is supported by the community. @@ -113,7 +121,7 @@ Be sure to set your MATLAB and Python paths in `Makefile.config` first! Now that you have installed Caffe, check out the [MNIST tutorial](gathered/examples/mnist.html) and the [reference ImageNet model tutorial](gathered/examples/imagenet.html). -### Compilation with CMake +### CMake Build In lieu of manually editing `Makefile.config` to configure the build, Caffe offers an unofficial CMake build thanks to @Nerei, @akosiorek, and other members of the community. It requires CMake version >= 2.8.7. The basic steps are as follows: @@ -129,9 +137,9 @@ See [PR #1667](https://github.com/BVLC/caffe/pull/1667) for options and details. ## Hardware -**Laboratory Tested Hardware**: Berkeley Vision runs Caffe with K40s, K20s, and Titans including models at ImageNet/ILSVRC scale. We also run on GTX series cards (980s and 770s) and GPU-equipped MacBook Pros. We have not encountered any trouble in-house with devices with CUDA capability >= 3.0. All reported hardware issues thus-far have been due to GPU configuration, overheating, and the like. +**Laboratory Tested Hardware**: Berkeley Vision runs Caffe with Titan Xs, K80s, GTX 980s, K40s, K20s, Titans, and GTX 770s including models at ImageNet/ILSVRC scale. We have not encountered any trouble in-house with devices with CUDA capability >= 3.0. All reported hardware issues thus-far have been due to GPU configuration, overheating, and the like. 
-**CUDA compute capability**: devices with compute capability <= 2.0 may have to reduce CUDA thread numbers and batch sizes due to hardware constraints. Your mileage may vary. +**CUDA compute capability**: devices with compute capability <= 2.0 may have to reduce CUDA thread numbers and batch sizes due to hardware constraints. Brew with caution; we recommend compute capability >= 3.0. Once installed, check your times against our [reference performance numbers](performance_hardware.html) to make sure everything is configured properly. diff --git a/docs/model_zoo.md b/docs/model_zoo.md index 06dc0a49ec7..3f77e82572c 100644 --- a/docs/model_zoo.md +++ b/docs/model_zoo.md @@ -3,7 +3,7 @@ title: Model Zoo --- # Caffe Model Zoo -Lots of researchers and engineers have made Caffe models for different tasks with all kinds of architectures and data. +Lots of researchers and engineers have made Caffe models for different tasks with all kinds of architectures and data: check out the [model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo)! These models are learned and applied for problems ranging from simple regression, to large-scale visual classification, to Siamese networks for image similarity, to speech and robotics applications. To help share these models, we introduce the model zoo framework: @@ -14,17 +14,17 @@ To help share these models, we introduce the model zoo framework: ## Where to get trained models -First of all, we bundle BVLC-trained models for unrestricted, out of the box use. +First of all, we bundle BAIR-trained models for unrestricted, out of the box use.
-See the [BVLC model license](#bvlc-model-license) for details. +See the [BAIR model license](#bair-model-license) for details. Each one of these can be downloaded by running `scripts/download_model_binary.py ` where `` is specified below: -- **BVLC Reference CaffeNet** in `models/bvlc_reference_caffenet`: AlexNet trained on ILSVRC 2012, with a minor variation from the version as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Jeff Donahue @jeffdonahue) -- **BVLC AlexNet** in `models/bvlc_alexnet`: AlexNet trained on ILSVRC 2012, almost exactly as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Evan Shelhamer @shelhamer) -- **BVLC Reference R-CNN ILSVRC-2013** in `models/bvlc_reference_rcnn_ilsvrc13`: pure Caffe implementation of [R-CNN](https://github.com/rbgirshick/rcnn) as described by Girshick et al. in CVPR 2014. (Trained by Ross Girshick @rbgirshick) -- **BVLC GoogLeNet** in `models/bvlc_googlenet`: GoogLeNet trained on ILSVRC 2012, almost exactly as described in [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842) by Szegedy et al. in ILSVRC 2014. (Trained by Sergio Guadarrama @sguada) +- **BAIR Reference CaffeNet** in `models/bvlc_reference_caffenet`: AlexNet trained on ILSVRC 2012, with a minor variation from the version as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. 
(Trained by Jeff Donahue @jeffdonahue) +- **BAIR AlexNet** in `models/bvlc_alexnet`: AlexNet trained on ILSVRC 2012, almost exactly as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Evan Shelhamer @shelhamer) +- **BAIR Reference R-CNN ILSVRC-2013** in `models/bvlc_reference_rcnn_ilsvrc13`: pure Caffe implementation of [R-CNN](https://github.com/rbgirshick/rcnn) as described by Girshick et al. in CVPR 2014. (Trained by Ross Girshick @rbgirshick) +- **BAIR GoogLeNet** in `models/bvlc_googlenet`: GoogLeNet trained on ILSVRC 2012, almost exactly as described in [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842) by Szegedy et al. in ILSVRC 2014. (Trained by Sergio Guadarrama @sguada) -**Community models** made by Caffe users are posted to a publicly editable [wiki page](https://github.com/BVLC/caffe/wiki/Model-Zoo). +**Community models** made by Caffe users are posted to a publicly editable [model zoo wiki page](https://github.com/BVLC/caffe/wiki/Model-Zoo). These models are subject to conditions of their respective authors such as citation and license. Thank you for sharing your models! @@ -42,6 +42,8 @@ A caffe model is distributed as a directory containing: - License information. - [optional] Other helpful scripts. +This simple format can be handled through bundled scripts or manually if need be. + ### Hosting model info Github Gist is a good format for model info distribution because it can contain multiple files, is versionable, and has in-browser syntax highlighting and markdown rendering. @@ -55,14 +57,14 @@ Downloading model info is done just as easily with `scripts/download_model_from_ ### Hosting trained models It is up to the user where to host the `.caffemodel` file. -We host our BVLC-provided models on our own server. +We host our BAIR-provided models on our own server. 
Dropbox also works fine (tip: make sure that `?dl=1` is appended to the end of the URL). `scripts/download_model_binary.py ` downloads the `.caffemodel` from the URL specified in the `/readme.md` frontmatter and confirms SHA1. -## BVLC model license +## BAIR model license -The Caffe models bundled by the BVLC are released for unrestricted use. +The Caffe models bundled by the BAIR are released for unrestricted use. These models are trained on data from the [ImageNet project](http://www.image-net.org/) and training data includes internet photos that may be subject to copyright. diff --git a/docs/multigpu.md b/docs/multigpu.md index d91acef980d..e04ebb0b7c8 100644 --- a/docs/multigpu.md +++ b/docs/multigpu.md @@ -13,7 +13,7 @@ The GPUs to be used for training can be set with the "-gpu" flag on the command # Hardware Configuration Assumptions The current implementation uses a tree reduction strategy. e.g. if there are 4 GPUs in the system, 0:1, 2:3 will exchange gradients, then 0:2 (top of the tree) will exchange gradients, 0 will calculate -updated model, 0\-\>2, and then 0\-\>1, 2\-\>3. +updated model, 0\-\>2, and then 0\-\>1, 2\-\>3. For best performance, P2P DMA access between devices is needed. Without P2P access, for example crossing PCIe root complex, data is copied through host and effective exchange bandwidth is greatly reduced. @@ -23,4 +23,4 @@ Current implementation has a "soft" assumption that the devices being used are h # Scaling Performance -Performance is **heavily** dependent on the PCIe topology of the system, the configuration of the neural network you are training, and the speed of each of the layers. Systems like the DIGITS DevBox have an optimized PCIe topology (X99-E WS chipset). In general, scaling on 2 GPUs tends to be ~1.8X on average for networks like AlexNet, CaffeNet, VGG, GoogleNet. 4 GPUs begins to have falloff in scaling. 
Generally with "weak scaling" where the batchsize increases with the number of GPUs you will see 3.5x scaling or so. With "strong scaling", the system can become communication bound, especially with layer performance optimizations like those in [cuDNNv3](http://nvidia.com/cudnn), and you will likely see closer to mid 2.x scaling in performance. Networks that have heavy computation compared to the number of parameters tend to have the best scaling performance. \ No newline at end of file +Performance is **heavily** dependent on the PCIe topology of the system, the configuration of the neural network you are training, and the speed of each of the layers. Systems like the DIGITS DevBox have an optimized PCIe topology (X99-E WS chipset). In general, scaling on 2 GPUs tends to be ~1.8X on average for networks like AlexNet, CaffeNet, VGG, GoogleNet. 4 GPUs begins to have falloff in scaling. Generally with "weak scaling" where the batchsize increases with the number of GPUs you will see 3.5x scaling or so. With "strong scaling", the system can become communication bound, especially with layer performance optimizations like those in [cuDNNv3](http://nvidia.com/cudnn), and you will likely see closer to mid 2.x scaling in performance. Networks that have heavy computation compared to the number of parameters tend to have the best scaling performance. diff --git a/docs/performance_hardware.md b/docs/performance_hardware.md deleted file mode 100644 index cdd4b361dea..00000000000 --- a/docs/performance_hardware.md +++ /dev/null @@ -1,73 +0,0 @@ ---- -title: Performance and Hardware Configuration ---- - -# Performance and Hardware Configuration - -To measure performance on different NVIDIA GPUs we use CaffeNet, the Caffe reference ImageNet model. - -For training, each time point is 20 iterations/minibatches of 256 images for 5,120 images total. For testing, a 50,000 image validation set is classified. 
- -**Acknowledgements**: BVLC members are very grateful to NVIDIA for providing several GPUs to conduct this research. - -## NVIDIA K40 - -Performance is best with ECC off and boost clock enabled. While ECC makes a negligible difference in speed, disabling it frees ~1 GB of GPU memory. - -Best settings with ECC off and maximum clock speed in standard Caffe: - -* Training is 26.5 secs / 20 iterations (5,120 images) -* Testing is 100 secs / validation set (50,000 images) - -Best settings with Caffe + [cuDNN acceleration](http://nvidia.com/cudnn): - -* Training is 19.2 secs / 20 iterations (5,120 images) -* Testing is 60.7 secs / validation set (50,000 images) - -Other settings: - -* ECC on, max speed: training 26.7 secs / 20 iterations, test 101 secs / validation set -* ECC on, default speed: training 31 secs / 20 iterations, test 117 secs / validation set -* ECC off, default speed: training 31 secs / 20 iterations, test 118 secs / validation set - -### K40 configuration tips - -For maximum K40 performance, turn off ECC and boost the clock speed (at your own risk). - -To turn off ECC, do - - sudo nvidia-smi -i 0 --ecc-config=0 # repeat with -i x for each GPU ID - -then reboot. - -Set the "persistence" mode of the GPU settings by - - sudo nvidia-smi -pm 1 - -and then set the clock speed with - - sudo nvidia-smi -i 0 -ac 3004,875 # repeat with -i x for each GPU ID - -but note that this configuration resets across driver reloading / rebooting. Include these commands in a boot script to initialize these settings. For a simple fix, add these commands to `/etc/rc.local` (on Ubuntu). - -## NVIDIA Titan - -Training: 26.26 secs / 20 iterations (5,120 images). -Testing: 100 secs / validation set (50,000 images). - -cuDNN Training: 20.25 secs / 20 iterations (5,120 images). -cuDNN Testing: 66.3 secs / validation set (50,000 images). - - -## NVIDIA K20 - -Training: 36.0 secs / 20 iterations (5,120 images). -Testing: 133 secs / validation set (50,000 images). 
- -## NVIDIA GTX 770 - -Training: 33.0 secs / 20 iterations (5,120 images). -Testing: 129 secs / validation set (50,000 images). - -cuDNN Training: 24.3 secs / 20 iterations (5,120 images). -cuDNN Testing: 104 secs / validation set (50,000 images). diff --git a/docs/tutorial/interfaces.md b/docs/tutorial/interfaces.md index d7ff378239d..b5a4f1ad069 100644 --- a/docs/tutorial/interfaces.md +++ b/docs/tutorial/interfaces.md @@ -91,7 +91,7 @@ In MatCaffe, you can * Run for a certain number of iterations and give back control to Matlab * Intermingle arbitrary Matlab code with gradient steps -An ILSVRC image classification demo is in caffe/matlab/demo/classification_demo.m (you need to download BVLC CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) to run it). +An ILSVRC image classification demo is in caffe/matlab/demo/classification_demo.m (you need to download BAIR CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) to run it). ### Build MatCaffe @@ -114,7 +114,7 @@ You can save your Matlab search PATH by running `savepath` so that you don't hav MatCaffe is very similar to PyCaffe in usage. -Examples below shows detailed usages and assumes you have downloaded BVLC CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) and started `matlab` from caffe root folder. +Examples below shows detailed usages and assumes you have downloaded BAIR CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) and started `matlab` from caffe root folder. 
model = './models/bvlc_reference_caffenet/deploy.prototxt'; weights = './models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'; diff --git a/docs/tutorial/layers.md b/docs/tutorial/layers.md index 7362aac298a..2faacc5836d 100644 --- a/docs/tutorial/layers.md +++ b/docs/tutorial/layers.md @@ -1,186 +1,77 @@ --- title: Layer Catalogue --- + # Layers To create a Caffe model you need to define the model architecture in a protocol buffer definition file (prototxt). Caffe layers and their parameters are defined in the protocol buffer definitions for the project in [caffe.proto](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto). -### Vision Layers - -* Header: `./include/caffe/vision_layers.hpp` - -Vision layers usually take *images* as input and produce other *images* as output. -A typical "image" in the real-world may have one color channel ($$c = 1$$), as in a grayscale image, or three color channels ($$c = 3$$) as in an RGB (red, green, blue) image. -But in this context, the distinguishing characteristic of an image is its spatial structure: usually an image has some non-trivial height $$h > 1$$ and width $$w > 1$$. -This 2D geometry naturally lends itself to certain decisions about how to process the input. -In particular, most of the vision layers work by applying a particular operation to some region of the input to produce a corresponding region of the output. -In contrast, other layers (with few exceptions) ignore the spatial structure of the input, effectively treating it as "one big vector" with dimension $$chw$$. 
- - -#### Convolution - -* Layer type: `Convolution` -* CPU implementation: `./src/caffe/layers/convolution_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/convolution_layer.cu` -* Parameters (`ConvolutionParameter convolution_param`) - - Required - - `num_output` (`c_o`): the number of filters - - `kernel_size` (or `kernel_h` and `kernel_w`): specifies height and width of each filter - - Strongly Recommended - - `weight_filler` [default `type: 'constant' value: 0`] - - Optional - - `bias_term` [default `true`]: specifies whether to learn and apply a set of additive biases to the filter outputs - - `pad` (or `pad_h` and `pad_w`) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input - - `stride` (or `stride_h` and `stride_w`) [default 1]: specifies the intervals at which to apply the filters to the input - - `group` (g) [default 1]: If g > 1, we restrict the connectivity of each filter to a subset of the input. Specifically, the input and output channels are separated into g groups, and the $$i$$th output group channels will be only connected to the $$i$$th input group channels. -* Input - - `n * c_i * h_i * w_i` -* Output - - `n * c_o * h_o * w_o`, where `h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1` and `w_o` likewise. 
-* Sample (as seen in `./models/bvlc_reference_caffenet/train_val.prototxt`) - - layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - # learning rate and decay multipliers for the filters - param { lr_mult: 1 decay_mult: 1 } - # learning rate and decay multipliers for the biases - param { lr_mult: 2 decay_mult: 0 } - convolution_param { - num_output: 96 # learn 96 filters - kernel_size: 11 # each filter is 11x11 - stride: 4 # step 4 pixels between each filter application - weight_filler { - type: "gaussian" # initialize the filters from a Gaussian - std: 0.01 # distribution with stdev 0.01 (default mean: 0) - } - bias_filler { - type: "constant" # initialize the biases to zero (0) - value: 0 - } - } - } - -The `Convolution` layer convolves the input image with a set of learnable filters, each producing one feature map in the output image. - -#### Pooling - -* Layer type: `Pooling` -* CPU implementation: `./src/caffe/layers/pooling_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/pooling_layer.cu` -* Parameters (`PoolingParameter pooling_param`) - - Required - - `kernel_size` (or `kernel_h` and `kernel_w`): specifies height and width of each filter - - Optional - - `pool` [default MAX]: the pooling method. Currently MAX, AVE, or STOCHASTIC - - `pad` (or `pad_h` and `pad_w`) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input - - `stride` (or `stride_h` and `stride_w`) [default 1]: specifies the intervals at which to apply the filters to the input -* Input - - `n * c * h_i * w_i` -* Output - - `n * c * h_o * w_o`, where h_o and w_o are computed in the same way as convolution. 
-* Sample (as seen in `./models/bvlc_reference_caffenet/train_val.prototxt`) - - layer { - name: "pool1" - type: "Pooling" - bottom: "conv1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 # pool over a 3x3 region - stride: 2 # step two pixels (in the bottom blob) between pooling regions - } - } - -#### Local Response Normalization (LRN) - -* Layer type: `LRN` -* CPU Implementation: `./src/caffe/layers/lrn_layer.cpp` -* CUDA GPU Implementation: `./src/caffe/layers/lrn_layer.cu` -* Parameters (`LRNParameter lrn_param`) - - Optional - - `local_size` [default 5]: the number of channels to sum over (for cross channel LRN) or the side length of the square region to sum over (for within channel LRN) - - `alpha` [default 1]: the scaling parameter (see below) - - `beta` [default 5]: the exponent (see below) - - `norm_region` [default `ACROSS_CHANNELS`]: whether to sum over adjacent channels (`ACROSS_CHANNELS`) or nearby spatial locaitons (`WITHIN_CHANNEL`) +## Data Layers -The local response normalization layer performs a kind of "lateral inhibition" by normalizing over local input regions. In `ACROSS_CHANNELS` mode, the local regions extend across nearby channels, but have no spatial extent (i.e., they have shape `local_size x 1 x 1`). In `WITHIN_CHANNEL` mode, the local regions extend spatially, but are in separate channels (i.e., they have shape `1 x local_size x local_size`). Each input value is divided by $$(1 + (\alpha/n) \sum_i x_i^2)^\beta$$, where $$n$$ is the size of each local region, and the sum is taken over the region centered at that value (zero padding is added where necessary). - -#### im2col - -`Im2col` is a helper for doing the image-to-column transformation that you most likely do not need to know about. This is used in Caffe's original convolution to do matrix multiplication by laying out all patches into a matrix. - -### Loss Layers +Data enters Caffe through data layers: they lie at the bottom of nets. 
Data can come from efficient databases (LevelDB or LMDB), directly from memory, or, when efficiency is not critical, from files on disk in HDF5 or common image formats. -Loss drives learning by comparing an output to a target and assigning cost to minimize. The loss itself is computed by the forward pass and the gradient w.r.t. to the loss is computed by the backward pass. +Common input preprocessing (mean subtraction, scaling, random cropping, and mirroring) is available by specifying `TransformationParameter`s in some of the layers. +The [bias](layers/bias.html), [scale](layers/scale.html), and [crop](layers/crop.html) layers can be helpful with transforming the inputs, when `TransformationParameter` isn't available. -#### Softmax +Layers: -* Layer type: `SoftmaxWithLoss` +* [Image Data](layers/imagedata.html) - read raw images. +* [Database](layers/data.html) - read data from LEVELDB or LMDB. +* [HDF5 Input](layers/hdf5data.html) - read HDF5 data, allows data of arbitrary dimensions. +* [HDF5 Output](layers/hdf5output.html) - write data as HDF5. +* [Input](layers/input.html) - typically used for networks that are being deployed. +* [Window Data](layers/windowdata.html) - read window data file. +* [Memory Data](layers/memorydata.html) - read data directly from memory. +* [Dummy Data](layers/dummydata.html) - for static data and debugging. -The softmax loss layer computes the multinomial logistic loss of the softmax of its inputs. It's conceptually identical to a softmax layer followed by a multinomial logistic loss layer, but provides a more numerically stable gradient. +Note that the [Python](layers/python.html) Layer can be useful for creating custom data layers. -#### Sum-of-Squares / Euclidean +## Vision Layers -* Layer type: `EuclideanLoss` +Vision layers usually take *images* as input and produce other *images* as output, although they can take data of other types and dimensions.
+A typical "image" in the real-world may have one color channel ($$c = 1$$), as in a grayscale image, or three color channels ($$c = 3$$) as in an RGB (red, green, blue) image. +But in this context, the distinguishing characteristic of an image is its spatial structure: usually an image has some non-trivial height $$h > 1$$ and width $$w > 1$$. +This 2D geometry naturally lends itself to certain decisions about how to process the input. +In particular, most of the vision layers work by applying a particular operation to some region of the input to produce a corresponding region of the output. +In contrast, other layers (with few exceptions) ignore the spatial structure of the input, effectively treating it as "one big vector" with dimension $$chw$$. -The Euclidean loss layer computes the sum of squares of differences of its two inputs, $$\frac 1 {2N} \sum_{i=1}^N \| x^1_i - x^2_i \|_2^2$$. +Layers: -#### Hinge / Margin +* [Convolution Layer](layers/convolution.html) - convolves the input image with a set of learnable filters, each producing one feature map in the output image. +* [Pooling Layer](layers/pooling.html) - max, average, or stochastic pooling. +* [Spatial Pyramid Pooling (SPP)](layers/spp.html) +* [Crop](layers/crop.html) - perform cropping transformation. +* [Deconvolution Layer](layers/deconvolution.html) - transposed convolution. -* Layer type: `HingeLoss` -* CPU implementation: `./src/caffe/layers/hinge_loss_layer.cpp` -* CUDA GPU implementation: none yet -* Parameters (`HingeLossParameter hinge_loss_param`) - - Optional - - `norm` [default L1]: the norm used. Currently L1, L2 -* Inputs - - `n * c * h * w` Predictions - - `n * 1 * 1 * 1` Labels -* Output - - `1 * 1 * 1 * 1` Computed Loss -* Samples +* [Im2Col](layers/im2col.html) - relic helper layer that is not used much anymore. 
- # L1 Norm - layer { - name: "loss" - type: "HingeLoss" - bottom: "pred" - bottom: "label" - } +## Recurrent Layers - # L2 Norm - layer { - name: "loss" - type: "HingeLoss" - bottom: "pred" - bottom: "label" - top: "loss" - hinge_loss_param { - norm: L2 - } - } +Layers: -The hinge loss layer computes a one-vs-all hinge or squared hinge loss. +* [Recurrent](layers/recurrent.html) +* [RNN](layers/rnn.html) +* [Long-Short Term Memory (LSTM)](layers/lstm.html) -#### Sigmoid Cross-Entropy +## Common Layers -`SigmoidCrossEntropyLoss` +Layers: -#### Infogain +* [Inner Product](layers/innerproduct.html) - fully connected layer. +* [Dropout](layers/dropout.html) +* [Embed](layers/embed.html) - for learning embeddings of one-hot encoded vector (takes index as input). -`InfogainLoss` +## Normalization Layers -#### Accuracy and Top-k +* [Local Response Normalization (LRN)](layers/lrn.html) - performs a kind of "lateral inhibition" by normalizing over local input regions. +* [Mean Variance Normalization (MVN)](layers/mvn.html) - performs contrast normalization / instance normalization. +* [Batch Normalization](layers/batchnorm.html) - performs normalization over mini-batches. -`Accuracy` scores the output as the accuracy of output with respect to target -- it is not actually a loss and has no backward step. +The [bias](layers/bias.html) and [scale](layers/scale.html) layers can be helpful in combination with normalization. -### Activation / Neuron Layers +## Activation / Neuron Layers In general, activation / Neuron layers are element-wise operators, taking one bottom blob and producing one top blob of the same size. 
In the layers below, we will ignore the input and out sizes as they are identical: @@ -189,337 +80,56 @@ In general, activation / Neuron layers are element-wise operators, taking one bo * Output - n * c * h * w -#### ReLU / Rectified-Linear and Leaky-ReLU - -* Layer type: `ReLU` -* CPU implementation: `./src/caffe/layers/relu_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/relu_layer.cu` -* Parameters (`ReLUParameter relu_param`) - - Optional - - `negative_slope` [default 0]: specifies whether to leak the negative part by multiplying it with the slope value rather than setting it to 0. -* Sample (as seen in `./models/bvlc_reference_caffenet/train_val.prototxt`) - - layer { - name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" - } - -Given an input value x, The `ReLU` layer computes the output as x if x > 0 and negative_slope * x if x <= 0. When the negative slope parameter is not set, it is equivalent to the standard ReLU function of taking max(x, 0). It also supports in-place computation, meaning that the bottom and the top blob could be the same to preserve memory consumption. - -#### Sigmoid - -* Layer type: `Sigmoid` -* CPU implementation: `./src/caffe/layers/sigmoid_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/sigmoid_layer.cu` -* Sample (as seen in `./examples/mnist/mnist_autoencoder.prototxt`) - - layer { - name: "encode1neuron" - bottom: "encode1" - top: "encode1neuron" - type: "Sigmoid" - } - -The `Sigmoid` layer computes the output as sigmoid(x) for each input element x. - -#### TanH / Hyperbolic Tangent - -* Layer type: `TanH` -* CPU implementation: `./src/caffe/layers/tanh_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/tanh_layer.cu` -* Sample - - layer { - name: "layer" - bottom: "in" - top: "out" - type: "TanH" - } - -The `TanH` layer computes the output as tanh(x) for each input element x. 
- -#### Absolute Value - -* Layer type: `AbsVal` -* CPU implementation: `./src/caffe/layers/absval_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/absval_layer.cu` -* Sample - - layer { - name: "layer" - bottom: "in" - top: "out" - type: "AbsVal" - } - -The `AbsVal` layer computes the output as abs(x) for each input element x. - -#### Power - -* Layer type: `Power` -* CPU implementation: `./src/caffe/layers/power_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/power_layer.cu` -* Parameters (`PowerParameter power_param`) - - Optional - - `power` [default 1] - - `scale` [default 1] - - `shift` [default 0] -* Sample - - layer { - name: "layer" - bottom: "in" - top: "out" - type: "Power" - power_param { - power: 1 - scale: 1 - shift: 0 - } - } - -The `Power` layer computes the output as (shift + scale * x) ^ power for each input element x. - -#### BNLL - -* Layer type: `BNLL` -* CPU implementation: `./src/caffe/layers/bnll_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/bnll_layer.cu` -* Sample - - layer { - name: "layer" - bottom: "in" - top: "out" - type: BNLL - } - -The `BNLL` (binomial normal log likelihood) layer computes the output as log(1 + exp(x)) for each input element x. - - -### Data Layers - -Data enters Caffe through data layers: they lie at the bottom of nets. Data can come from efficient databases (LevelDB or LMDB), directly from memory, or, when efficiency is not critical, from files on disk in HDF5 or common image formats. - -Common input preprocessing (mean subtraction, scaling, random cropping, and mirroring) is available by specifying `TransformationParameter`s. 
- -#### Database +Layers: -* Layer type: `Data` -* Parameters - - Required - - `source`: the name of the directory containing the database - - `batch_size`: the number of inputs to process at one time - - Optional - - `rand_skip`: skip up to this number of inputs at the beginning; useful for asynchronous sgd - - `backend` [default `LEVELDB`]: choose whether to use a `LEVELDB` or `LMDB` +* [ReLU / Rectified-Linear and Leaky-ReLU](layers/relu.html) - ReLU and Leaky-ReLU rectification. +* [PReLU](layers/prelu.html) - parametric ReLU. +* [ELU](layers/elu.html) - exponential linear rectification. +* [Sigmoid](layers/sigmoid.html) +* [TanH](layers/tanh.html) +* [Absolute Value](layers/abs.html) +* [Power](layers/power.html) - f(x) = (shift + scale * x) ^ power. +* [Exp](layers/exp.html) - f(x) = base ^ (shift + scale * x). +* [Log](layers/log.html) - f(x) = log(x). +* [BNLL](layers/bnll.html) - f(x) = log(1 + exp(x)). +* [Threshold](layers/threshold.html) - performs step function at user defined threshold. +* [Bias](layers/bias.html) - adds a bias to a blob that can either be learned or fixed. +* [Scale](layers/scale.html) - scales a blob by an amount that can either be learned or fixed. +## Utility Layers +Layers: -#### In-Memory +* [Flatten](layers/flatten.html) +* [Reshape](layers/reshape.html) +* [Batch Reindex](layers/batchreindex.html) -* Layer type: `MemoryData` -* Parameters - - Required - - `batch_size`, `channels`, `height`, `width`: specify the size of input chunks to read from memory +* [Split](layers/split.html) +* [Concat](layers/concat.html) +* [Slicing](layers/slice.html) +* [Eltwise](layers/eltwise.html) - element-wise operations such as product or sum between two blobs. +* [Filter / Mask](layers/filter.html) - mask or select output using last blob. +* [Parameter](layers/parameter.html) - enable parameters to be shared between layers. +* [Reduction](layers/reduction.html) - reduce input blob to scalar blob using operations such as sum or mean. 
+* [Silence](layers/silence.html) - prevent top-level blobs from being printed during training. -The memory data layer reads data directly from memory, without copying it. In order to use it, one must call `MemoryDataLayer::Reset` (from C++) or `Net.set_input_arrays` (from Python) in order to specify a source of contiguous data (as 4D row major array), which is read one batch-sized chunk at a time. +* [ArgMax](layers/argmax.html) +* [Softmax](layers/softmax.html) -#### HDF5 Input +* [Python](layers/python.html) - allows custom Python layers. -* Layer type: `HDF5Data` -* Parameters - - Required - - `source`: the name of the file to read from - - `batch_size` +## Loss Layers -#### HDF5 Output - -* Layer type: `HDF5Output` -* Parameters - - Required - - `file_name`: name of file to write to - -The HDF5 output layer performs the opposite function of the other layers in this section: it writes its input blobs to disk. - -#### Images - -* Layer type: `ImageData` -* Parameters - - Required - - `source`: name of a text file, with each line giving an image filename and label - - `batch_size`: number of images to batch together - - Optional - - `rand_skip` - - `shuffle` [default false] - - `new_height`, `new_width`: if provided, resize all images to this size - -#### Windows - -`WindowData` - -#### Dummy - -`DummyData` is for development and debugging. See `DummyDataParameter`. 
- -### Common Layers - -#### Inner Product - -* Layer type: `InnerProduct` -* CPU implementation: `./src/caffe/layers/inner_product_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/inner_product_layer.cu` -* Parameters (`InnerProductParameter inner_product_param`) - - Required - - `num_output` (`c_o`): the number of filters - - Strongly recommended - - `weight_filler` [default `type: 'constant' value: 0`] - - Optional - - `bias_filler` [default `type: 'constant' value: 0`] - - `bias_term` [default `true`]: specifies whether to learn and apply a set of additive biases to the filter outputs -* Input - - `n * c_i * h_i * w_i` -* Output - - `n * c_o * 1 * 1` -* Sample - - layer { - name: "fc8" - type: "InnerProduct" - # learning rate and decay multipliers for the weights - param { lr_mult: 1 decay_mult: 1 } - # learning rate and decay multipliers for the biases - param { lr_mult: 2 decay_mult: 0 } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } - bottom: "fc7" - top: "fc8" - } - -The `InnerProduct` layer (also usually referred to as the fully connected layer) treats the input as a simple vector and produces an output in the form of a single vector (with the blob's height and width set to 1). - -#### Splitting - -The `Split` layer is a utility layer that splits an input blob to multiple output blobs. This is used when a blob is fed into multiple output layers. 
- -#### Flattening - -The `Flatten` layer is a utility layer that flattens an input of shape `n * c * h * w` to a simple vector output of shape `n * (c*h*w)` - -#### Reshape - -* Layer type: `Reshape` -* Implementation: `./src/caffe/layers/reshape_layer.cpp` -* Parameters (`ReshapeParameter reshape_param`) - - Optional: (also see detailed description below) - - `shape` - -* Input - - a single blob with arbitrary dimensions -* Output - - the same blob, with modified dimensions, as specified by `reshape_param` - -* Sample - - layer { - name: "reshape" - type: "Reshape" - bottom: "input" - top: "output" - reshape_param { - shape { - dim: 0 # copy the dimension from below - dim: 2 - dim: 3 - dim: -1 # infer it from the other dimensions - } - } - } - -The `Reshape` layer can be used to change the dimensions of its input, without changing its data. Just like the `Flatten` layer, only the dimensions are changed; no data is copied in the process. - -Output dimensions are specified by the `ReshapeParam` proto. Positive numbers are used directly, setting the corresponding dimension of the output blob. In addition, two special values are accepted for any of the target dimension values: - -* **0** means "copy the respective dimension of the bottom layer". That is, if the bottom has 2 as its 1st dimension, the top will have 2 as its 1st dimension as well, given `dim: 0` as the 1st target dimension. -* **-1** stands for "infer this from the other dimensions". This behavior is similar to that of -1 in *numpy*'s or `[]` for *MATLAB*'s reshape: this dimension is calculated to keep the overall element count the same as in the bottom layer. At most one -1 can be used in a reshape operation. - -As another example, specifying `reshape_param { shape { dim: 0 dim: -1 } }` makes the layer behave in exactly the same way as the `Flatten` layer. 
- -#### Concatenation - -* Layer type: `Concat` -* CPU implementation: `./src/caffe/layers/concat_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/concat_layer.cu` -* Parameters (`ConcatParameter concat_param`) - - Optional - - `axis` [default 1]: 0 for concatenation along num and 1 for channels. -* Input - - `n_i * c_i * h * w` for each input blob i from 1 to K. -* Output - - if `axis = 0`: `(n_1 + n_2 + ... + n_K) * c_1 * h * w`, and all input `c_i` should be the same. - - if `axis = 1`: `n_1 * (c_1 + c_2 + ... + c_K) * h * w`, and all input `n_i` should be the same. -* Sample - - layer { - name: "concat" - bottom: "in1" - bottom: "in2" - top: "out" - type: "Concat" - concat_param { - axis: 1 - } - } - -The `Concat` layer is a utility layer that concatenates its multiple input blobs to one single output blob. - -#### Slicing - -The `Slice` layer is a utility layer that slices an input layer to multiple output layers along a given dimension (currently num or channel only) with given slice indices. - -* Sample - - layer { - name: "slicer_label" - type: "Slice" - bottom: "label" - ## Example of label with a shape N x 3 x 1 x 1 - top: "label1" - top: "label2" - top: "label3" - slice_param { - axis: 1 - slice_point: 1 - slice_point: 2 - } - } - -`axis` indicates the target axis; `slice_point` indicates indexes in the selected dimension (the number of indices must be equal to the number of top blobs minus one). - - -#### Elementwise Operations - -`Eltwise` - -#### Argmax - -`ArgMax` - -#### Softmax +Loss drives learning by comparing an output to a target and assigning cost to minimize. The loss itself is computed by the forward pass and the gradient w.r.t. to the loss is computed by the backward pass. -`Softmax` +Layers: -#### Mean-Variance Normalization +* [Multinomial Logistic Loss](layers/multinomiallogisticloss.html) +* [Infogain Loss](layers/infogainloss.html) - a generalization of MultinomialLogisticLossLayer. 
+* [Softmax with Loss](layers/softmaxwithloss.html) - computes the multinomial logistic loss of the softmax of its inputs. It's conceptually identical to a softmax layer followed by a multinomial logistic loss layer, but provides a more numerically stable gradient. +* [Sum-of-Squares / Euclidean](layers/euclideanloss.html) - computes the sum of squares of differences of its two inputs, $$\frac 1 {2N} \sum_{i=1}^N \| x^1_i - x^2_i \|_2^2$$. +* [Hinge / Margin](layers/hingeloss.html) - computes a one-vs-all hinge (L1) or squared hinge loss (L2). +* [Sigmoid Cross-Entropy Loss](layers/sigmoidcrossentropyloss.html) - computes the cross-entropy (logistic) loss, often used for predicting targets interpreted as probabilities. +* [Accuracy / Top-k layer](layers/accuracy.html) - scores the output as an accuracy with respect to target -- it is not actually a loss and has no backward step. +* [Contrastive Loss](layers/contrastiveloss.html) -`MVN` diff --git a/docs/tutorial/layers/absval.md b/docs/tutorial/layers/absval.md new file mode 100644 index 00000000000..220c41189be --- /dev/null +++ b/docs/tutorial/layers/absval.md @@ -0,0 +1,22 @@ +--- +title: Absolute Value Layer +--- + +# Absolute Value Layer + +* Layer type: `AbsVal` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1AbsValLayer.html) +* Header: [`./include/caffe/layers/absval_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/absval_layer.hpp) +* CPU implementation: [`./src/caffe/layers/absval_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/absval_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/absval_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/absval_layer.cu) + +* Sample + + layer { + name: "layer" + bottom: "in" + top: "out" + type: "AbsVal" + } + +The `AbsVal` layer computes the output as abs(x) for each input element x.
diff --git a/docs/tutorial/layers/accuracy.md b/docs/tutorial/layers/accuracy.md new file mode 100644 index 00000000000..80293b1c6bf --- /dev/null +++ b/docs/tutorial/layers/accuracy.md @@ -0,0 +1,20 @@ +--- +title: Accuracy and Top-k +--- + +# Accuracy and Top-k + +`Accuracy` scores the output as the accuracy of output with respect to target -- it is not actually a loss and has no backward step. + +* Layer type: `Accuracy` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1AccuracyLayer.html) +* Header: [`./include/caffe/layers/accuracy_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/accuracy_layer.hpp) +* CPU implementation: [`./src/caffe/layers/accuracy_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/accuracy_layer.cpp) + +## Parameters +* Parameters (`AccuracyParameter accuracy_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/AccuracyParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/argmax.md b/docs/tutorial/layers/argmax.md new file mode 100644 index 00000000000..9eb8b7739f5 --- /dev/null +++ b/docs/tutorial/layers/argmax.md @@ -0,0 +1,18 @@ +--- +title: ArgMax Layer +--- + +# ArgMax Layer + +* Layer type: `ArgMax` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ArgMaxLayer.html) +* Header: [`./include/caffe/layers/argmax_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/argmax_layer.hpp) +* CPU implementation: [`./src/caffe/layers/argmax_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/argmax_layer.cpp) + +## Parameters
* Parameters (`ArgMaxParameter argmax_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ArgMaxParameter.txt %} +{% endhighlight
%} diff --git a/docs/tutorial/layers/batchnorm.md b/docs/tutorial/layers/batchnorm.md new file mode 100644 index 00000000000..a5be5ce08bf --- /dev/null +++ b/docs/tutorial/layers/batchnorm.md @@ -0,0 +1,20 @@ +--- +title: Batch Norm Layer +--- + +# Batch Norm Layer + +* Layer type: `BatchNorm` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1BatchNormLayer.html) +* Header: [`./include/caffe/layers/batch_norm_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/batch_norm_layer.hpp) +* CPU implementation: [`./src/caffe/layers/batch_norm_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/batch_norm_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/batch_norm_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/batch_norm_layer.cu) + +## Parameters + +* Parameters (`BatchNormParameter batch_norm_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/BatchNormParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/batchreindex.md b/docs/tutorial/layers/batchreindex.md new file mode 100644 index 00000000000..21b36c39ba5 --- /dev/null +++ b/docs/tutorial/layers/batchreindex.md @@ -0,0 +1,16 @@ +--- +title: Batch Reindex Layer +--- + +# Batch Reindex Layer + +* Layer type: `BatchReindex` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1BatchReindexLayer.html) +* Header: [`./include/caffe/layers/batch_reindex_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/batch_reindex_layer.hpp) +* CPU implementation: [`./src/caffe/layers/batch_reindex_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/batch_reindex_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/batch_reindex_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/batch_reindex_layer.cu) + + 
+## Parameters + +No parameters. diff --git a/docs/tutorial/layers/bias.md b/docs/tutorial/layers/bias.md new file mode 100644 index 00000000000..d3a00c2fc78 --- /dev/null +++ b/docs/tutorial/layers/bias.md @@ -0,0 +1,19 @@ +--- +title: Bias Layer +--- + +# Bias Layer + +* Layer type: `Bias` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1BiasLayer.html) +* Header: [`./include/caffe/layers/bias_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/bias_layer.hpp) +* CPU implementation: [`./src/caffe/layers/bias_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/bias_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/bias_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/bias_layer.cu) + +## Parameters +* Parameters (`BiasParameter bias_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/BiasParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/bnll.md b/docs/tutorial/layers/bnll.md new file mode 100644 index 00000000000..2b68b79ff83 --- /dev/null +++ b/docs/tutorial/layers/bnll.md @@ -0,0 +1,25 @@ +--- +title: BNLL Layer +--- + +# BNLL Layer + +* Layer type: `BNLL` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1BNLLLayer.html) +* Header: [`./include/caffe/layers/bnll_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/bnll_layer.hpp) +* CPU implementation: [`./src/caffe/layers/bnll_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/bnll_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/bnll_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/bnll_layer.cu) + +The `BNLL` (binomial normal log likelihood) layer computes the output as log(1 + exp(x)) for each input element x. + +## Parameters +No parameters. 
+ +## Sample + + layer { + name: "layer" + bottom: "in" + top: "out" + type: BNLL + } diff --git a/docs/tutorial/layers/concat.md b/docs/tutorial/layers/concat.md new file mode 100644 index 00000000000..c7b253953d7 --- /dev/null +++ b/docs/tutorial/layers/concat.md @@ -0,0 +1,40 @@ +--- +title: Concat Layer +--- + +# Concat Layer + +* Layer type: `Concat` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ConcatLayer.html) +* Header: [`./include/caffe/layers/concat_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/concat_layer.hpp) +* CPU implementation: [`./src/caffe/layers/concat_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/concat_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/concat_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/concat_layer.cu) +* Input + - `n_i * c_i * h * w` for each input blob i from 1 to K. +* Output + - if `axis = 0`: `(n_1 + n_2 + ... + n_K) * c_1 * h * w`, and all input `c_i` should be the same. + - if `axis = 1`: `n_1 * (c_1 + c_2 + ... + c_K) * h * w`, and all input `n_i` should be the same. +* Sample + + layer { + name: "concat" + bottom: "in1" + bottom: "in2" + top: "out" + type: "Concat" + concat_param { + axis: 1 + } + } + +The `Concat` layer is a utility layer that concatenates its multiple input blobs to one single output blob. + +## Parameters +* Parameters (`ConcatParameter concat_param`) + - Optional + - `axis` [default 1]: 0 for concatenation along num and 1 for channels. 
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/ConcatParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/contrastiveloss.md b/docs/tutorial/layers/contrastiveloss.md new file mode 100644 index 00000000000..bb1859d9f37 --- /dev/null +++ b/docs/tutorial/layers/contrastiveloss.md @@ -0,0 +1,20 @@ +--- +title: Contrastive Loss Layer +--- + +# Contrastive Loss Layer + +* Layer type: `ContrastiveLoss` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ContrastiveLossLayer.html) +* Header: [`./include/caffe/layers/contrastive_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/contrastive_loss_layer.hpp) +* CPU implementation: [`./src/caffe/layers/contrastive_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/contrastive_loss_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/contrastive_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/contrastive_loss_layer.cu) + +## Parameters + +* Parameters (`ContrastiveLossParameter contrastive_loss_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/ContrastiveLossParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/convolution.md b/docs/tutorial/layers/convolution.md new file mode 100644 index 00000000000..cc9f4fd0449 --- /dev/null +++ b/docs/tutorial/layers/convolution.md @@ -0,0 +1,63 @@ +--- +title: Convolution Layer +--- + +# Convolution Layer + +* Layer type: `Convolution` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ConvolutionLayer.html) +* Header: [`./include/caffe/layers/conv_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/conv_layer.hpp) +* CPU implementation: 
[`./src/caffe/layers/conv_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/conv_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) +* Input + - `n * c_i * h_i * w_i` +* Output + - `n * c_o * h_o * w_o`, where `h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1` and `w_o` likewise. + +The `Convolution` layer convolves the input image with a set of learnable filters, each producing one feature map in the output image. + +## Sample + +Sample (as seen in [`./models/bvlc_reference_caffenet/train_val.prototxt`](https://github.com/BVLC/caffe/blob/master/models/bvlc_reference_caffenet/train_val.prototxt)): + + layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + # learning rate and decay multipliers for the filters + param { lr_mult: 1 decay_mult: 1 } + # learning rate and decay multipliers for the biases + param { lr_mult: 2 decay_mult: 0 } + convolution_param { + num_output: 96 # learn 96 filters + kernel_size: 11 # each filter is 11x11 + stride: 4 # step 4 pixels between each filter application + weight_filler { + type: "gaussian" # initialize the filters from a Gaussian + std: 0.01 # distribution with stdev 0.01 (default mean: 0) + } + bias_filler { + type: "constant" # initialize the biases to zero (0) + value: 0 + } + } + } + +## Parameters +* Parameters (`ConvolutionParameter convolution_param`) + - Required + - `num_output` (`c_o`): the number of filters + - `kernel_size` (or `kernel_h` and `kernel_w`): specifies height and width of each filter + - Strongly Recommended + - `weight_filler` [default `type: 'constant' value: 0`] + - Optional + - `bias_term` [default `true`]: specifies whether to learn and apply a set of additive biases to the filter outputs + - `pad` (or `pad_h` and `pad_w`) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input + - `stride` (or `stride_h` and `stride_w`) 
[default 1]: specifies the intervals at which to apply the filters to the input + - `group` (g) [default 1]: If g > 1, we restrict the connectivity of each filter to a subset of the input. Specifically, the input and output channels are separated into g groups, and the $$i$$th output group channels will be only connected to the $$i$$th input group channels. +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/ConvolutionParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/crop.md b/docs/tutorial/layers/crop.md new file mode 100644 index 00000000000..28f91241f74 --- /dev/null +++ b/docs/tutorial/layers/crop.md @@ -0,0 +1,20 @@ +--- +title: Crop Layer +--- + +# Crop Layer + +* Layer type: `Crop` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1CropLayer.html) +* Header: [`./include/caffe/layers/crop_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/crop_layer.hpp) +* CPU implementation: [`./src/caffe/layers/crop_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/crop_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/crop_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/crop_layer.cu) + +## Parameters + +* Parameters (`CropParameter crop_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/CropParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/data.md b/docs/tutorial/layers/data.md new file mode 100644 index 00000000000..58e0dcaab22 --- /dev/null +++ b/docs/tutorial/layers/data.md @@ -0,0 +1,29 @@ +--- +title: Database Layer +--- + +# Database Layer + +* Layer type: `Data` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1DataLayer.html) +* Header: 
[`./include/caffe/layers/data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/data_layer.hpp) +* CPU implementation: [`./src/caffe/layers/data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/data_layer.cpp) + + +## Parameters + +* Parameters (`DataParameter data_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/DataParameter.txt %} +{% endhighlight %} + +* Parameters + - Required + - `source`: the name of the directory containing the database + - `batch_size`: the number of inputs to process at one time + - Optional + - `rand_skip`: skip up to this number of inputs at the beginning; useful for asynchronous sgd + - `backend` [default `LEVELDB`]: choose whether to use a `LEVELDB` or `LMDB` + diff --git a/docs/tutorial/layers/deconvolution.md b/docs/tutorial/layers/deconvolution.md new file mode 100644 index 00000000000..2eff967d613 --- /dev/null +++ b/docs/tutorial/layers/deconvolution.md @@ -0,0 +1,22 @@ +--- +title: Deconvolution Layer +--- + +# Deconvolution Layer + +* Layer type: `Deconvolution` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1DeconvolutionLayer.html) +* Header: [`./include/caffe/layers/deconv_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/deconv_layer.hpp) +* CPU implementation: [`./src/caffe/layers/deconv_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/deconv_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/deconv_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/deconv_layer.cu) + +## Parameters + +Uses the same parameters as the Convolution layer. 
+ +* Parameters (`ConvolutionParameter convolution_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ConvolutionParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/dropout.md b/docs/tutorial/layers/dropout.md new file mode 100644 index 00000000000..d8c6f9556be --- /dev/null +++ b/docs/tutorial/layers/dropout.md @@ -0,0 +1,20 @@ +--- +title: Dropout Layer +--- + +# Dropout Layer + +* Layer type: `Dropout` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1DropoutLayer.html) +* Header: [`./include/caffe/layers/dropout_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/dropout_layer.hpp) +* CPU implementation: [`./src/caffe/layers/dropout_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/dropout_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/dropout_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/dropout_layer.cu) + +## Parameters + +* Parameters (`DropoutParameter dropout_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/DropoutParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/dummydata.md b/docs/tutorial/layers/dummydata.md new file mode 100644 index 00000000000..d069f9c595e --- /dev/null +++ b/docs/tutorial/layers/dummydata.md @@ -0,0 +1,20 @@ +--- +title: Dummy Data Layer +--- + +# Dummy Data Layer + +* Layer type: `DummyData` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1DummyDataLayer.html) +* Header: [`./include/caffe/layers/dummy_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/dummy_data_layer.hpp) +* CPU implementation: 
[`./src/caffe/layers/dummy_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/dummy_data_layer.cpp) + + +## Parameters + +* Parameters (`DummyDataParameter dummy_data_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/DummyDataParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/eltwise.md b/docs/tutorial/layers/eltwise.md new file mode 100644 index 00000000000..70fe7910c5a --- /dev/null +++ b/docs/tutorial/layers/eltwise.md @@ -0,0 +1,20 @@ +--- +title: Eltwise Layer +--- + +# Eltwise Layer + +* Layer type: `Eltwise` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1EltwiseLayer.html) +* Header: [`./include/caffe/layers/eltwise_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/eltwise_layer.hpp) +* CPU implementation: [`./src/caffe/layers/eltwise_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/eltwise_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/eltwise_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/eltwise_layer.cu) + +## Parameters + +* Parameters (`EltwiseParameter eltwise_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/EltwiseParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/elu.md b/docs/tutorial/layers/elu.md new file mode 100644 index 00000000000..11db0f0e3d6 --- /dev/null +++ b/docs/tutorial/layers/elu.md @@ -0,0 +1,25 @@ +--- +title: ELU Layer +--- + +# ELU Layer + +* Layer type: `ELU` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ELULayer.html) +* Header: [`./include/caffe/layers/elu_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/elu_layer.hpp) +* CPU implementation: 
[`./src/caffe/layers/elu_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/elu_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/elu_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/elu_layer.cu) + +## References + +* Clevert, Djork-Arne, Thomas Unterthiner, and Sepp Hochreiter. + "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)" [arXiv:1511.07289](https://arxiv.org/abs/1511.07289). (2015). + +## Parameters + +* Parameters (`ELUParameter elu_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ELUParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/embed.md b/docs/tutorial/layers/embed.md new file mode 100644 index 00000000000..271636d8d97 --- /dev/null +++ b/docs/tutorial/layers/embed.md @@ -0,0 +1,20 @@ +--- +title: Embed Layer +--- + +# Embed Layer + +* Layer type: `Embed` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1EmbedLayer.html) +* Header: [`./include/caffe/layers/embed_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/embed_layer.hpp) +* CPU implementation: [`./src/caffe/layers/embed_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/embed_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/embed_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/embed_layer.cu) + +## Parameters + +* Parameters (`EmbedParameter embed_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/EmbedParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/euclideanloss.md b/docs/tutorial/layers/euclideanloss.md new file mode 100644 index 00000000000..c1b72084c14 --- /dev/null +++ b/docs/tutorial/layers/euclideanloss.md @@ -0,0 +1,16 @@ +--- 
+title: Euclidean Loss Layer +--- +# Sum-of-Squares / Euclidean Loss Layer + +* Layer type: `EuclideanLoss` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1EuclideanLossLayer.html) +* Header: [`./include/caffe/layers/euclidean_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/euclidean_loss_layer.hpp) +* CPU implementation: [`./src/caffe/layers/euclidean_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/euclidean_loss_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/euclidean_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/euclidean_loss_layer.cu) + +The Euclidean loss layer computes the sum of squares of differences of its two inputs, $$\frac 1 {2N} \sum_{i=1}^N \| x^1_i - x^2_i \|_2^2$$. + +## Parameters + +Does not take any parameters. diff --git a/docs/tutorial/layers/exp.md b/docs/tutorial/layers/exp.md new file mode 100644 index 00000000000..ef2500ec214 --- /dev/null +++ b/docs/tutorial/layers/exp.md @@ -0,0 +1,24 @@ +--- +title: Exponential Layer +--- + +# Exponential Layer + +* Layer type: `Exp` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ExpLayer.html) +* Header: [`./include/caffe/layers/exp_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/exp_layer.hpp) +* CPU implementation: [`./src/caffe/layers/exp_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/exp_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/exp_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/exp_layer.cu) + +## Parameters + +* Parameters (`ExpParameter exp_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ExpParameter.txt %} +{% endhighlight %} + +## See also + +* [Power layer](power.html) diff --git a/docs/tutorial/layers/filter.md 
b/docs/tutorial/layers/filter.md new file mode 100644 index 00000000000..aeda9ee66f8 --- /dev/null +++ b/docs/tutorial/layers/filter.md @@ -0,0 +1,15 @@ +--- +title: Filter Layer +--- + +# Filter Layer + +* Layer type: `Filter` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1FilterLayer.html) +* Header: [`./include/caffe/layers/filter_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/filter_layer.hpp) +* CPU implementation: [`./src/caffe/layers/filter_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/filter_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/filter_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/filter_layer.cu) + +## Parameters + +Does not take any parameters. diff --git a/docs/tutorial/layers/flatten.md b/docs/tutorial/layers/flatten.md new file mode 100644 index 00000000000..ecf08262707 --- /dev/null +++ b/docs/tutorial/layers/flatten.md @@ -0,0 +1,21 @@ +--- +title: Flatten Layer +--- + +# Flatten Layer + +* Layer type: `Flatten` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1FlattenLayer.html) +* Header: [`./include/caffe/layers/flatten_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/flatten_layer.hpp) +* CPU implementation: [`./src/caffe/layers/flatten_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/flatten_layer.cpp) + +The `Flatten` layer is a utility layer that flattens an input of shape `n * c * h * w` to a simple vector output of shape `n * (c*h*w)`. 
+ +## Parameters + +* Parameters (`FlattenParameter flatten_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/FlattenParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/hdf5data.md b/docs/tutorial/layers/hdf5data.md new file mode 100644 index 00000000000..d6b7ea24d2e --- /dev/null +++ b/docs/tutorial/layers/hdf5data.md @@ -0,0 +1,20 @@ +--- +title: HDF5 Data Layer +--- + +# HDF5 Data Layer + +* Layer type: `HDF5Data` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1HDF5DataLayer.html) +* Header: [`./include/caffe/layers/hdf5_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/hdf5_data_layer.hpp) +* CPU implementation: [`./src/caffe/layers/hdf5_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hdf5_data_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/hdf5_data_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hdf5_data_layer.cu) + +## Parameters + +* Parameters (`HDF5DataParameter hdf5_data_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/HDF5DataParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/hdf5output.md b/docs/tutorial/layers/hdf5output.md new file mode 100644 index 00000000000..cfbe4ddb771 --- /dev/null +++ b/docs/tutorial/layers/hdf5output.md @@ -0,0 +1,25 @@ +--- +title: HDF5 Output Layer +--- + +# HDF5 Output Layer + +* Layer type: `HDF5Output` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1HDF5OutputLayer.html) +* Header: [`./include/caffe/layers/hdf5_output_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/hdf5_output_layer.hpp) +* CPU implementation: 
[`./src/caffe/layers/hdf5_output_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hdf5_output_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/hdf5_output_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hdf5_output_layer.cu) + +The HDF5 output layer performs the opposite function of the other layers in this section: it writes its input blobs to disk. + +## Parameters + +* Parameters (`HDF5OutputParameter hdf5_output_param`) + - Required + - `file_name`: name of file to write to + +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/HDF5OutputParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/hingeloss.md b/docs/tutorial/layers/hingeloss.md new file mode 100644 index 00000000000..ef4fd95e29d --- /dev/null +++ b/docs/tutorial/layers/hingeloss.md @@ -0,0 +1,19 @@ +--- +title: Hinge Loss Layer +--- + +# Hinge (L1, L2) Loss Layer + +* Layer type: `HingeLoss` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1HingeLossLayer.html) +* Header: [`./include/caffe/layers/hinge_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/hinge_loss_layer.hpp) +* CPU implementation: [`./src/caffe/layers/hinge_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hinge_loss_layer.cpp) + +## Parameters + +* Parameters (`HingeLossParameter hinge_loss_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/HingeLossParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/im2col.md b/docs/tutorial/layers/im2col.md new file mode 100644 index 00000000000..0badc1cdd93 --- /dev/null +++ b/docs/tutorial/layers/im2col.md @@ -0,0 +1,16 @@ +--- +title: Im2col Layer +--- + +# im2col + +* Layer type: `Im2col` +* Header: 
[`./include/caffe/layers/im2col_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/im2col_layer.hpp) +* CPU implementation: [`./src/caffe/layers/im2col_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/im2col_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/im2col_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/im2col_layer.cu) + +`Im2col` is a helper for doing the image-to-column transformation that you most +likely do not need to know about. This is used in Caffe's original convolution +to do matrix multiplication by laying out all patches into a matrix. + + diff --git a/docs/tutorial/layers/imagedata.md b/docs/tutorial/layers/imagedata.md new file mode 100644 index 00000000000..82c8a600be3 --- /dev/null +++ b/docs/tutorial/layers/imagedata.md @@ -0,0 +1,27 @@ +--- +title: ImageData Layer +--- + +# ImageData Layer + +* Layer type: `ImageData` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ImageDataLayer.html) +* Header: [`./include/caffe/layers/image_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/image_data_layer.hpp) +* CPU implementation: [`./src/caffe/layers/image_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/image_data_layer.cpp) + +## Parameters + +* Parameters (`ImageDataParameter image_data_param`) + - Required + - `source`: name of a text file, with each line giving an image filename and label + - `batch_size`: number of images to batch together + - Optional + - `rand_skip` + - `shuffle` [default false] + - `new_height`, `new_width`: if provided, resize all images to this size + +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ImageDataParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/infogainloss.md b/docs/tutorial/layers/infogainloss.md new file 
mode 100644 index 00000000000..b3b690d2621 --- /dev/null +++ b/docs/tutorial/layers/infogainloss.md @@ -0,0 +1,23 @@ +--- +title: Infogain Loss Layer +--- + +# Infogain Loss Layer + +* Layer type: `InfogainLoss` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1InfogainLossLayer.html) +* Header: [`./include/caffe/layers/infogain_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/infogain_loss_layer.hpp) +* CPU implementation: [`./src/caffe/layers/infogain_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/infogain_loss_layer.cpp) + +A generalization of [MultinomialLogisticLossLayer](multinomiallogisticloss.html) that takes an "information gain" (infogain) matrix specifying the "value" of all label pairs. + +Equivalent to the [MultinomialLogisticLossLayer](multinomiallogisticloss.html) if the infogain matrix is the identity. + +## Parameters + +* Parameters (`InfogainLossParameter infogain_loss_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/InfogainLossParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/innerproduct.md b/docs/tutorial/layers/innerproduct.md new file mode 100644 index 00000000000..98b9bea81f5 --- /dev/null +++ b/docs/tutorial/layers/innerproduct.md @@ -0,0 +1,59 @@ +--- +title: Inner Product / Fully Connected Layer +--- + +# Inner Product / Fully Connected Layer + +* Layer type: `InnerProduct` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1InnerProductLayer.html) +* Header: [`./include/caffe/layers/inner_product_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/inner_product_layer.hpp) +* CPU implementation: [`./src/caffe/layers/inner_product_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/inner_product_layer.cpp) +* CUDA GPU implementation: 
[`./src/caffe/layers/inner_product_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/inner_product_layer.cu) + +* Input + - `n * c_i * h_i * w_i` +* Output + - `n * c_o * 1 * 1` +* Sample + + layer { + name: "fc8" + type: "InnerProduct" + # learning rate and decay multipliers for the weights + param { lr_mult: 1 decay_mult: 1 } + # learning rate and decay multipliers for the biases + param { lr_mult: 2 decay_mult: 0 } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } + bottom: "fc7" + top: "fc8" + } + +The `InnerProduct` layer (also usually referred to as the fully connected layer) treats the input as a simple vector and produces an output in the form of a single vector (with the blob's height and width set to 1). + + +## Parameters + +* Parameters (`InnerProductParameter inner_product_param`) + - Required + - `num_output` (`c_o`): the number of filters + - Strongly recommended + - `weight_filler` [default `type: 'constant' value: 0`] + - Optional + - `bias_filler` [default `type: 'constant' value: 0`] + - `bias_term` [default `true`]: specifies whether to learn and apply a set of additive biases to the filter outputs +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/InnerProductParameter.txt %} +{% endhighlight %} + diff --git a/docs/tutorial/layers/input.md b/docs/tutorial/layers/input.md new file mode 100644 index 00000000000..b74c35d2fb5 --- /dev/null +++ b/docs/tutorial/layers/input.md @@ -0,0 +1,19 @@ +--- +title: Input Layer +--- + +# Input Layer + +* Layer type: `Input` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1InputLayer.html) +* Header: [`./include/caffe/layers/input_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/input_layer.hpp) +* CPU implementation: 
[`./src/caffe/layers/input_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/input_layer.cpp) + +## Parameters + +* Parameters (`InputParameter input_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/InputParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/log.md b/docs/tutorial/layers/log.md new file mode 100644 index 00000000000..df52037489c --- /dev/null +++ b/docs/tutorial/layers/log.md @@ -0,0 +1,20 @@ +--- +title: Log Layer +--- + +# Log Layer + +* Layer type: `Log` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1LogLayer.html) +* Header: [`./include/caffe/layers/log_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/log_layer.hpp) +* CPU implementation: [`./src/caffe/layers/log_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/log_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/log_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/log_layer.cu) + +## Parameters + +* Parameters (`LogParameter log_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/LogParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/lrn.md b/docs/tutorial/layers/lrn.md new file mode 100644 index 00000000000..f5e4829279d --- /dev/null +++ b/docs/tutorial/layers/lrn.md @@ -0,0 +1,28 @@ +--- +title: Local Response Normalization (LRN) +--- + +# Local Response Normalization (LRN) + +* Layer type: `LRN` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1LRNLayer.html) +* Header: [`./include/caffe/layers/lrn_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/lrn_layer.hpp) +* CPU Implementation: 
[`./src/caffe/layers/lrn_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lrn_layer.cpp) +* CUDA GPU Implementation: [`./src/caffe/layers/lrn_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lrn_layer.cu) +* Parameters (`LRNParameter lrn_param`) + - Optional + - `local_size` [default 5]: the number of channels to sum over (for cross channel LRN) or the side length of the square region to sum over (for within channel LRN) + - `alpha` [default 1]: the scaling parameter (see below) + - `beta` [default 0.75]: the exponent (see below) + - `norm_region` [default `ACROSS_CHANNELS`]: whether to sum over adjacent channels (`ACROSS_CHANNELS`) or nearby spatial locations (`WITHIN_CHANNEL`) + +The local response normalization layer performs a kind of "lateral inhibition" by normalizing over local input regions. In `ACROSS_CHANNELS` mode, the local regions extend across nearby channels, but have no spatial extent (i.e., they have shape `local_size x 1 x 1`). In `WITHIN_CHANNEL` mode, the local regions extend spatially, but are in separate channels (i.e., they have shape `1 x local_size x local_size`). Each input value is divided by $$(1 + (\alpha/n) \sum_i x_i^2)^\beta$$, where $$n$$ is the size of each local region, and the sum is taken over the region centered at that value (zero padding is added where necessary). 
+ +## Parameters + +* Parameters (`LRNParameter lrn_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/LRNParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/lstm.md b/docs/tutorial/layers/lstm.md new file mode 100644 index 00000000000..8e4095e950b --- /dev/null +++ b/docs/tutorial/layers/lstm.md @@ -0,0 +1,21 @@ +--- +title: LSTM Layer +--- + +# LSTM Layer + +* Layer type: `LSTM` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1LSTMLayer.html) +* Header: [`./include/caffe/layers/lstm_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/lstm_layer.hpp) +* CPU implementation: [`./src/caffe/layers/lstm_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lstm_layer.cpp) +* CPU implementation (helper): [`./src/caffe/layers/lstm_unit_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lstm_unit_layer.cpp) +* CUDA GPU implementation (helper): [`./src/caffe/layers/lstm_unit_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lstm_unit_layer.cu) + +## Parameters + +* Parameters (`RecurrentParameter recurrent_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/RecurrentParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/memorydata.md b/docs/tutorial/layers/memorydata.md new file mode 100644 index 00000000000..afce4a24a28 --- /dev/null +++ b/docs/tutorial/layers/memorydata.md @@ -0,0 +1,25 @@ +--- +title: Memory Data Layer +--- + +# Memory Data Layer + +* Layer type: `MemoryData` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1MemoryDataLayer.html) +* Header: 
[`./include/caffe/layers/memory_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/memory_data_layer.hpp) +* CPU implementation: [`./src/caffe/layers/memory_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/memory_data_layer.cpp) + +The memory data layer reads data directly from memory, without copying it. In order to use it, one must call `MemoryDataLayer::Reset` (from C++) or `Net.set_input_arrays` (from Python) in order to specify a source of contiguous data (as 4D row major array), which is read one batch-sized chunk at a time. + +## Parameters + +* Parameters (`MemoryDataParameter memory_data_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/MemoryDataParameter.txt %} +{% endhighlight %} + +* Parameters + - Required + - `batch_size`, `channels`, `height`, `width`: specify the size of input chunks to read from memory diff --git a/docs/tutorial/layers/multinomiallogisticloss.md b/docs/tutorial/layers/multinomiallogisticloss.md new file mode 100644 index 00000000000..5eab74a8a69 --- /dev/null +++ b/docs/tutorial/layers/multinomiallogisticloss.md @@ -0,0 +1,19 @@ +--- +title: Multinomial Logistic Loss Layer +--- + +# Multinomial Logistic Loss Layer + +* Layer type: `MultinomialLogisticLoss` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1MultinomialLogisticLossLayer.html) +* Header: [`./include/caffe/layers/multinomial_logistic_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/multinomial_logistic_loss_layer.hpp) +* CPU implementation: [`./src/caffe/layers/multinomial_logistic_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/multinomial_logistic_loss_layer.cpp) + +## Parameters + +* Parameters (`LossParameter loss_param`) +* From 
[`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/LossParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/mvn.md b/docs/tutorial/layers/mvn.md new file mode 100644 index 00000000000..08e44887d22 --- /dev/null +++ b/docs/tutorial/layers/mvn.md @@ -0,0 +1,20 @@ +--- +title: Mean-Variance Normalization (MVN) Layer +--- + +# Mean-Variance Normalization (MVN) Layer + +* Layer type: `MVN` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1MVNLayer.html) +* Header: [`./include/caffe/layers/mvn_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/mvn_layer.hpp) +* CPU implementation: [`./src/caffe/layers/mvn_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/mvn_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/mvn_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/mvn_layer.cu) + +## Parameters + +* Parameters (`MVNParameter mvn_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/MVNParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/parameter.md b/docs/tutorial/layers/parameter.md new file mode 100644 index 00000000000..b7e85ec5c9a --- /dev/null +++ b/docs/tutorial/layers/parameter.md @@ -0,0 +1,21 @@ +--- +title: Parameter Layer +--- + +# Parameter Layer + +* Layer type: `Parameter` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ParameterLayer.html) +* Header: [`./include/caffe/layers/parameter_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/parameter_layer.hpp) +* CPU implementation: [`./src/caffe/layers/parameter_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/parameter_layer.cpp) + +See 
[https://github.com/BVLC/caffe/pull/2079](https://github.com/BVLC/caffe/pull/2079). + +## Parameters + +* Parameters (`ParameterParameter parameter_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ParameterParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/pooling.md b/docs/tutorial/layers/pooling.md new file mode 100644 index 00000000000..12669ee8d45 --- /dev/null +++ b/docs/tutorial/layers/pooling.md @@ -0,0 +1,47 @@ +--- +title: Pooling Layer +--- +# Pooling + +* Layer type: `Pooling` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1PoolingLayer.html) +* Header: [`./include/caffe/layers/pooling_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/pooling_layer.hpp) +* CPU implementation: [`./src/caffe/layers/pooling_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/pooling_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) + +* Input + - `n * c * h_i * w_i` +* Output + - `n * c * h_o * w_o`, where h_o and w_o are computed in the same way as convolution. + +## Parameters + +* Parameters (`PoolingParameter pooling_param`) + - Required + - `kernel_size` (or `kernel_h` and `kernel_w`): specifies height and width of each filter + - Optional + - `pool` [default MAX]: the pooling method. 
Currently MAX, AVE, or STOCHASTIC + - `pad` (or `pad_h` and `pad_w`) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input + - `stride` (or `stride_h` and `stride_w`) [default 1]: specifies the intervals at which to apply the filters to the input + + +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/PoolingParameter.txt %} +{% endhighlight %} + +## Sample +* Sample (as seen in [`./models/bvlc_reference_caffenet/train_val.prototxt`](https://github.com/BVLC/caffe/blob/master/models/bvlc_reference_caffenet/train_val.prototxt)) + + layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 # pool over a 3x3 region + stride: 2 # step two pixels (in the bottom blob) between pooling regions + } + } diff --git a/docs/tutorial/layers/power.md b/docs/tutorial/layers/power.md new file mode 100644 index 00000000000..d6617529b7d --- /dev/null +++ b/docs/tutorial/layers/power.md @@ -0,0 +1,46 @@ +--- +title: Power Layer +--- + +# Power Layer + +* Layer type: `Power` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1PowerLayer.html) +* Header: [`./include/caffe/layers/power_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/power_layer.hpp) +* CPU implementation: [`./src/caffe/layers/power_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/power_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/power_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/power_layer.cu) + +The `Power` layer computes the output as (shift + scale * x) ^ power for each input element x. 
+ +## Parameters +* Parameters (`PowerParameter power_param`) + - Optional + - `power` [default 1] + - `scale` [default 1] + - `shift` [default 0] + +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/PowerParameter.txt %} +{% endhighlight %} + + + +## Sample + + layer { + name: "layer" + bottom: "in" + top: "out" + type: "Power" + power_param { + power: 1 + scale: 1 + shift: 0 + } + } + +## See also + +* [Exponential layer](exp.html) diff --git a/docs/tutorial/layers/prelu.md b/docs/tutorial/layers/prelu.md new file mode 100644 index 00000000000..e7b7b44acb6 --- /dev/null +++ b/docs/tutorial/layers/prelu.md @@ -0,0 +1,20 @@ +--- +title: PReLU Layer +--- + +# PReLU Layer + +* Layer type: `PReLU` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1PReLULayer.html) +* Header: [`./include/caffe/layers/prelu_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/prelu_layer.hpp) +* CPU implementation: [`./src/caffe/layers/prelu_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/prelu_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/prelu_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/prelu_layer.cu) + +## Parameters + +* Parameters (`PReLUParameter prelu_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/PReLUParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/python.md b/docs/tutorial/layers/python.md new file mode 100644 index 00000000000..2e30b3a79f6 --- /dev/null +++ b/docs/tutorial/layers/python.md @@ -0,0 +1,27 @@ +--- +title: Python Layer +--- + +# Python Layer + +* Layer type: `Python` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1PythonLayer.html) +* Header: 
[`./include/caffe/layers/python_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/python_layer.hpp) + +The Python layer allows users to add customized layers without modifying the Caffe core code. + +## Parameters + +* Parameters (`PythonParameter python_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/PythonParameter.txt %} +{% endhighlight %} + +## Examples and tutorials + +* Simple Euclidean loss example +    * [Python code](https://github.com/BVLC/caffe/blob/master/examples/pycaffe/layers/pyloss.py) +    * [Prototxt](https://github.com/BVLC/caffe/blob/master/examples/pycaffe/linreg.prototxt) +* [Tutorial for writing Python layers with DIGITS](https://github.com/NVIDIA/DIGITS/tree/master/examples/python-layer) diff --git a/docs/tutorial/layers/recurrent.md b/docs/tutorial/layers/recurrent.md new file mode 100644 index 00000000000..a882b722f8a --- /dev/null +++ b/docs/tutorial/layers/recurrent.md @@ -0,0 +1,20 @@ +--- +title: Recurrent Layer +--- + +# Recurrent Layer + +* Layer type: `Recurrent` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1RecurrentLayer.html) +* Header: [`./include/caffe/layers/recurrent_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/recurrent_layer.hpp) +* CPU implementation: [`./src/caffe/layers/recurrent_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/recurrent_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/recurrent_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/recurrent_layer.cu) + +## Parameters + +* Parameters (`RecurrentParameter recurrent_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/RecurrentParameter.txt %} +{% endhighlight %} diff --git 
a/docs/tutorial/layers/reduction.md b/docs/tutorial/layers/reduction.md new file mode 100644 index 00000000000..db55414b0af --- /dev/null +++ b/docs/tutorial/layers/reduction.md @@ -0,0 +1,20 @@ +--- +title: Reduction Layer +--- + +# Reduction Layer + +* Layer type: `Reduction` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ReductionLayer.html) +* Header: [`./include/caffe/layers/reduction_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/reduction_layer.hpp) +* CPU implementation: [`./src/caffe/layers/reduction_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/reduction_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/reduction_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/reduction_layer.cu) + +## Parameters + +* Parameters (`ReductionParameter reduction_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ReductionParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/relu.md b/docs/tutorial/layers/relu.md new file mode 100644 index 00000000000..01aab0af4fa --- /dev/null +++ b/docs/tutorial/layers/relu.md @@ -0,0 +1,32 @@ +--- +title: ReLU / Rectified-Linear and Leaky-ReLU Layer +--- + +# ReLU / Rectified-Linear and Leaky-ReLU Layer + +* Layer type: `ReLU` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ReLULayer.html) +* Header: [`./include/caffe/layers/relu_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/relu_layer.hpp) +* CPU implementation: [`./src/caffe/layers/relu_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/relu_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/relu_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/relu_layer.cu) +* Sample (as seen in 
[`./models/bvlc_reference_caffenet/train_val.prototxt`](https://github.com/BVLC/caffe/blob/master/models/bvlc_reference_caffenet/train_val.prototxt)) + + layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" + } + +Given an input value x, the `ReLU` layer computes the output as x if x > 0 and negative_slope * x if x <= 0. When the negative slope parameter is not set, it is equivalent to the standard ReLU function of taking max(x, 0). It also supports in-place computation, meaning that the bottom and the top blob could be the same to conserve memory. + +## Parameters + +* Parameters (`ReLUParameter relu_param`) + - Optional + - `negative_slope` [default 0]: specifies whether to leak the negative part by multiplying it with the slope value rather than setting it to 0. +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ReLUParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/reshape.md b/docs/tutorial/layers/reshape.md new file mode 100644 index 00000000000..92d23f2c73e --- /dev/null +++ b/docs/tutorial/layers/reshape.md @@ -0,0 +1,51 @@ +--- +title: Reshape Layer +--- + +# Reshape Layer +* Layer type: `Reshape` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ReshapeLayer.html) +* Header: [`./include/caffe/layers/reshape_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/reshape_layer.hpp) +* Implementation: [`./src/caffe/layers/reshape_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/reshape_layer.cpp) + +* Input + - a single blob with arbitrary dimensions +* Output + - the same blob, with modified dimensions, as specified by `reshape_param` + +* Sample + + layer { + name: "reshape" + type: "Reshape" + bottom: "input" + top: "output" + reshape_param { + shape { + dim: 0 # copy the dimension from below + dim: 2 + dim: 3 + dim: -1 # 
infer it from the other dimensions + } + } + } + +The `Reshape` layer can be used to change the dimensions of its input, without changing its data. Just like the `Flatten` layer, only the dimensions are changed; no data is copied in the process. + +Output dimensions are specified by the `ReshapeParam` proto. Positive numbers are used directly, setting the corresponding dimension of the output blob. In addition, two special values are accepted for any of the target dimension values: + +* **0** means "copy the respective dimension of the bottom layer". That is, if the bottom has 2 as its 1st dimension, the top will have 2 as its 1st dimension as well, given `dim: 0` as the 1st target dimension. +* **-1** stands for "infer this from the other dimensions". This behavior is similar to that of -1 in *numpy*'s or `[]` for *MATLAB*'s reshape: this dimension is calculated to keep the overall element count the same as in the bottom layer. At most one -1 can be used in a reshape operation. + +As another example, specifying `reshape_param { shape { dim: 0 dim: -1 } }` makes the layer behave in exactly the same way as the `Flatten` layer. 
+ +## Parameters + +* Parameters (`ReshapeParameter reshape_param`) + - Optional: (also see detailed description above) + - `shape` +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ReshapeParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/rnn.md b/docs/tutorial/layers/rnn.md new file mode 100644 index 00000000000..b6fcf47133f --- /dev/null +++ b/docs/tutorial/layers/rnn.md @@ -0,0 +1,19 @@ +--- +title: RNN Layer +--- + +# RNN Layer + +* Layer type: `RNN` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1RNNLayer.html) +* Header: [`./include/caffe/layers/rnn_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/rnn_layer.hpp) +* CPU implementation: [`./src/caffe/layers/rnn_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/rnn_layer.cpp) + +## Parameters + +* Parameters (`RecurrentParameter recurrent_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/RecurrentParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/scale.md b/docs/tutorial/layers/scale.md new file mode 100644 index 00000000000..0e27549ad52 --- /dev/null +++ b/docs/tutorial/layers/scale.md @@ -0,0 +1,20 @@ +--- +title: Scale Layer +--- + +# Scale Layer + +* Layer type: `Scale` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ScaleLayer.html) +* Header: [`./include/caffe/layers/scale_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/scale_layer.hpp) +* CPU implementation: [`./src/caffe/layers/scale_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/scale_layer.cpp) +* CUDA GPU implementation: 
[`./src/caffe/layers/scale_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/scale_layer.cu) + +## Parameters + +* Parameters (`ScaleParameter scale_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ScaleParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/sigmoid.md b/docs/tutorial/layers/sigmoid.md new file mode 100644 index 00000000000..f18ac4b84ec --- /dev/null +++ b/docs/tutorial/layers/sigmoid.md @@ -0,0 +1,30 @@ +--- +title: Sigmoid Layer +--- + +# Sigmoid Layer + +* Layer type: `Sigmoid` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SigmoidLayer.html) +* Header: [`./include/caffe/layers/sigmoid_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/sigmoid_layer.hpp) +* CPU implementation: [`./src/caffe/layers/sigmoid_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/sigmoid_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cu) +* Example (from [`./examples/mnist/mnist_autoencoder.prototxt`](https://github.com/BVLC/caffe/blob/master/examples/mnist/mnist_autoencoder.prototxt)): + + layer { + name: "encode1neuron" + bottom: "encode1" + top: "encode1neuron" + type: "Sigmoid" + } + +The `Sigmoid` layer computes `sigmoid(x)` for each element `x` in the bottom blob. 
+ +## Parameters + +* Parameters (`SigmoidParameter sigmoid_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/SigmoidParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/sigmoidcrossentropyloss.md b/docs/tutorial/layers/sigmoidcrossentropyloss.md new file mode 100644 index 00000000000..a6e42cadfa9 --- /dev/null +++ b/docs/tutorial/layers/sigmoidcrossentropyloss.md @@ -0,0 +1,13 @@ +--- +title: Sigmoid Cross-Entropy Loss Layer +--- + +# Sigmoid Cross-Entropy Loss Layer + +* Layer type: `SigmoidCrossEntropyLoss` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SigmoidCrossEntropyLossLayer.html) +* Header: [`./include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp) +* CPU implementation: [`./src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu) + +To-do. 
diff --git a/docs/tutorial/layers/silence.md b/docs/tutorial/layers/silence.md new file mode 100644 index 00000000000..8b4579a9935 --- /dev/null +++ b/docs/tutorial/layers/silence.md @@ -0,0 +1,17 @@ +--- +title: Silence Layer +--- + +# Silence Layer + +* Layer type: `Silence` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SilenceLayer.html) +* Header: [`./include/caffe/layers/silence_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/silence_layer.hpp) +* CPU implementation: [`./src/caffe/layers/silence_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/silence_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/silence_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/silence_layer.cu) + +Silences a blob, so that it is not printed. + +## Parameters + +No parameters. diff --git a/docs/tutorial/layers/slice.md b/docs/tutorial/layers/slice.md new file mode 100644 index 00000000000..a492f1e82b9 --- /dev/null +++ b/docs/tutorial/layers/slice.md @@ -0,0 +1,42 @@ +--- +title: Slice Layer +--- + +# Slice Layer + +* Layer type: `Slice` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SliceLayer.html) +* Header: [`./include/caffe/layers/slice_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/slice_layer.hpp) +* CPU implementation: [`./src/caffe/layers/slice_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/slice_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/slice_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/slice_layer.cu) + +The `Slice` layer is a utility layer that slices an input layer to multiple output layers along a given dimension (currently num or channel only) with given slice indices. 
+ +* Sample + + layer { + name: "slicer_label" + type: "Slice" + bottom: "label" + ## Example of label with a shape N x 3 x 1 x 1 + top: "label1" + top: "label2" + top: "label3" + slice_param { + axis: 1 + slice_point: 1 + slice_point: 2 + } + } + +`axis` indicates the target axis; `slice_point` indicates indexes in the selected dimension (the number of indices must be equal to the number of top blobs minus one). + +## Parameters + +* Parameters (`SliceParameter slice_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/SliceParameter.txt %} +{% endhighlight %} + diff --git a/docs/tutorial/layers/softmax.md b/docs/tutorial/layers/softmax.md new file mode 100644 index 00000000000..e5d53425141 --- /dev/null +++ b/docs/tutorial/layers/softmax.md @@ -0,0 +1,24 @@ +--- +title: Softmax Layer +--- + +# Softmax Layer + +* Layer type: `Softmax` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SoftmaxLayer.html) +* Header: [`./include/caffe/layers/softmax_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/softmax_layer.hpp) +* CPU implementation: [`./src/caffe/layers/softmax_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/softmax_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_layer.cu) + +## Parameters + +* Parameters (`SoftmaxParameter softmax_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/SoftmaxParameter.txt %} +{% endhighlight %} + +## See also + +* [Softmax loss layer](softmaxwithloss.html) diff --git a/docs/tutorial/layers/softmaxwithloss.md b/docs/tutorial/layers/softmaxwithloss.md new file mode 100644 index 00000000000..d9a6774a0ed --- /dev/null +++ 
b/docs/tutorial/layers/softmaxwithloss.md @@ -0,0 +1,33 @@ +--- +title: Softmax with Loss Layer +--- + +# Softmax with Loss Layer + +* Layer type: `SoftmaxWithLoss` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SoftmaxWithLossLayer.html) +* Header: [`./include/caffe/layers/softmax_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/softmax_loss_layer.hpp) +* CPU implementation: [`./src/caffe/layers/softmax_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_loss_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/softmax_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_loss_layer.cu) + +The softmax loss layer computes the multinomial logistic loss of the softmax of its inputs. It's conceptually identical to a softmax layer followed by a multinomial logistic loss layer, but provides a more numerically stable gradient. + +## Parameters + +* Parameters (`SoftmaxParameter softmax_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/SoftmaxParameter.txt %} +{% endhighlight %} + +* Parameters (`LossParameter loss_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/LossParameter.txt %} +{% endhighlight %} + +## See also + +* [Softmax layer](softmax.html) diff --git a/docs/tutorial/layers/split.md b/docs/tutorial/layers/split.md new file mode 100644 index 00000000000..4fb71d1f26b --- /dev/null +++ b/docs/tutorial/layers/split.md @@ -0,0 +1,17 @@ +--- +title: Split Layer +--- + +# Split Layer + +* Layer type: `Split` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SplitLayer.html) +* Header: 
[`./include/caffe/layers/split_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/split_layer.hpp) +* CPU implementation: [`./src/caffe/layers/split_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/split_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/split_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/split_layer.cu) + +The `Split` layer is a utility layer that splits an input blob to multiple output blobs. This is used when a blob is fed into multiple output layers. + +## Parameters + +Does not take any parameters. diff --git a/docs/tutorial/layers/spp.md b/docs/tutorial/layers/spp.md new file mode 100644 index 00000000000..26e5862023e --- /dev/null +++ b/docs/tutorial/layers/spp.md @@ -0,0 +1,20 @@ +--- +title: Spatial Pyramid Pooling Layer +--- + +# Spatial Pyramid Pooling Layer + +* Layer type: `SPP` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SPPLayer.html) +* Header: [`./include/caffe/layers/spp_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/spp_layer.hpp) +* CPU implementation: [`./src/caffe/layers/spp_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/spp_layer.cpp) + + +## Parameters + +* Parameters (`SPPParameter spp_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/SPPParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/tanh.md b/docs/tutorial/layers/tanh.md new file mode 100644 index 00000000000..360634596f9 --- /dev/null +++ b/docs/tutorial/layers/tanh.md @@ -0,0 +1,18 @@ +--- +title: TanH Layer +--- + +# TanH Layer + +* Header: [`./include/caffe/layers/tanh_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/tanh_layer.hpp) +* CPU implementation: 
[`./src/caffe/layers/tanh_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/tanh_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/tanh_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/tanh_layer.cu) + +## Parameters + +* Parameters (`TanHParameter tanh_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/TanHParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/threshold.md b/docs/tutorial/layers/threshold.md new file mode 100644 index 00000000000..819e9e6f96d --- /dev/null +++ b/docs/tutorial/layers/threshold.md @@ -0,0 +1,18 @@ +--- +title: Threshold Layer +--- + +# Threshold Layer + +* Header: [`./include/caffe/layers/threshold_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/threshold_layer.hpp) +* CPU implementation: [`./src/caffe/layers/threshold_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/threshold_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/threshold_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/threshold_layer.cu) + +## Parameters + +* Parameters (`ThresholdParameter threshold_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ThresholdParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/tile.md b/docs/tutorial/layers/tile.md new file mode 100644 index 00000000000..ea03aaa43af --- /dev/null +++ b/docs/tutorial/layers/tile.md @@ -0,0 +1,20 @@ +--- +title: Tile Layer +--- + +# Tile Layer + +* Layer type: `Tile` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1TileLayer.html) +* Header: [`./include/caffe/layers/tile_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/tile_layer.hpp) +* CPU 
implementation: [`./src/caffe/layers/tile_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/tile_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/tile_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/tile_layer.cu) + +## Parameters + +* Parameters (`TileParameter tile_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/TileParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/windowdata.md b/docs/tutorial/layers/windowdata.md new file mode 100644 index 00000000000..0cb4a8dfeb7 --- /dev/null +++ b/docs/tutorial/layers/windowdata.md @@ -0,0 +1,19 @@ +--- +title: WindowData Layer +--- + +# WindowData Layer + +* Layer type: `WindowData` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1WindowDataLayer.html) +* Header: [`./include/caffe/layers/window_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/window_data_layer.hpp) +* CPU implementation: [`./src/caffe/layers/window_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/window_data_layer.cpp) + +## Parameters + +* Parameters (`WindowDataParameter`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/WindowDataParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/solver.md b/docs/tutorial/solver.md index b719f715a4b..81c626386a2 100644 --- a/docs/tutorial/solver.md +++ b/docs/tutorial/solver.md @@ -209,18 +209,11 @@ What distinguishes the method from SGD is the weight setting $$ W $$ on which we The **RMSprop** (`type: "RMSProp"`), suggested by Tieleman in a Coursera course lecture, is a gradient-based optimization method (like SGD). 
The update formulas are $$ -(v_t)_i = -\begin{cases} -(v_{t-1})_i + \delta, &(\nabla L(W_t))_i(\nabla L(W_{t-1}))_i > 0\\ -(v_{t-1})_i \cdot (1-\delta), & \text{else} -\end{cases} +\operatorname{MS}((W_t)_i)= \delta\operatorname{MS}((W_{t-1})_i)+ (1-\delta)(\nabla L(W_t))_i^2 \\ +(W_{t+1})_i= (W_{t})_i -\alpha\frac{(\nabla L(W_t))_i}{\sqrt{\operatorname{MS}((W_t)_i)}} $$ -$$ -(W_{t+1})_i =(W_t)_i - \alpha (v_t)_i, -$$ - -If the gradient updates results in oscillations the gradient is reduced by times $$1-\delta$$. Otherwise it will be increased by $$\delta$$. The default value of $$\delta$$ (`rms_decay`) is set to $$\delta = 0.02$$. +The default value of $$\delta$$ (`rms_decay`) is set to $$\delta=0.99$$. [1] T. Tieleman, and G. Hinton. [RMSProp: Divide the gradient by a running average of its recent magnitude](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf). diff --git a/examples/02-fine-tuning.ipynb b/examples/02-fine-tuning.ipynb index 07ca8df4d74..422259de424 100644 --- a/examples/02-fine-tuning.ipynb +++ b/examples/02-fine-tuning.ipynb @@ -70,7 +70,7 @@ "\n", "- `get_ilsvrc_aux.sh` to download the ImageNet data mean, labels, etc.\n", "- `download_model_binary.py` to download the pretrained reference model\n", - "- `finetune_flickr_style/assemble_data.py` downloadsd the style training and testing data\n", + "- `finetune_flickr_style/assemble_data.py` downloads the style training and testing data\n", "\n", "We'll download just a small subset of the full dataset for this exercise: just 2000 of the 80K images, from 5 of the 20 style categories. 
(To download the full dataset, set `full_dataset = True` in the cell below.)" ] @@ -146,7 +146,7 @@ "outputs": [], "source": [ "import os\n", - "weights = caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'\n", + "weights = os.path.join(caffe_root, 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel')\n", "assert os.path.exists(weights)" ] }, @@ -1141,7 +1141,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "So we did finetuning and it is awesome. Let's take a look at what kind of results we are able to get with a longer, more complete run of the style recognition dataset. Note: the below URL might be occassionally down because it is run on a research machine.\n", + "So we did finetuning and it is awesome. Let's take a look at what kind of results we are able to get with a longer, more complete run of the style recognition dataset. Note: the below URL might be occasionally down because it is run on a research machine.\n", "\n", "http://demo.vislab.berkeleyvision.org/" ] diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 663d7360b7d..43bbcb83789 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -19,11 +19,12 @@ foreach(source_file ${examples_srcs}) caffe_set_solution_folder(${name} examples) # install - install(TARGETS ${name} DESTINATION bin) + install(TARGETS ${name} DESTINATION ${CMAKE_INSTALL_BINDIR}) + if(UNIX OR APPLE) # Funny command to make tutorials work - # TODO: remove in future as soon as naming is standartaized everywhere + # TODO: remove in future as soon as naming is standardized everywhere set(__outname ${PROJECT_BINARY_DIR}/examples/${folder}/${name}${Caffe_POSTFIX}) add_custom_command(TARGET ${name} POST_BUILD COMMAND ln -sf "${__outname}" "${__outname}.bin") diff --git a/examples/brewing-logreg.ipynb b/examples/brewing-logreg.ipynb index c053b73b39f..0f87185a35b 100644 --- a/examples/brewing-logreg.ipynb +++ b/examples/brewing-logreg.ipynb @@ -73,12 +73,12 
@@ ")\n", "\n", "# Split into train and test\n", - "X, Xt, y, yt = sklearn.cross_validation.train_test_split(X, y)\n", + "X, Xt, y, yt = sklearn.model_selection.train_test_split(X, y)\n", "\n", "# Visualize sample of the data\n", "ind = np.random.permutation(X.shape[0])[:1000]\n", "df = pd.DataFrame(X[ind])\n", - "_ = pd.scatter_matrix(df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y[ind])" + "_ = pd.plotting.scatter_matrix(df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y[ind])" ] }, { @@ -111,7 +111,7 @@ "%%timeit\n", "# Train and test the scikit-learn SGD logistic regression.\n", "clf = sklearn.linear_model.SGDClassifier(\n", - " loss='log', n_iter=1000, penalty='l2', alpha=5e-4, class_weight='auto')\n", + " loss='log', n_iter=1000, penalty='l2', alpha=5e-4, class_weight='balanced')\n", "\n", "clf.fit(X, y)\n", "yt_pred = clf.predict(Xt)\n", diff --git a/examples/cifar10/cifar10_full_sigmoid_solver.prototxt b/examples/cifar10/cifar10_full_sigmoid_solver.prototxt index 7dd3ecb9d8e..a8e5539937d 100644 --- a/examples/cifar10/cifar10_full_sigmoid_solver.prototxt +++ b/examples/cifar10/cifar10_full_sigmoid_solver.prototxt @@ -17,7 +17,7 @@ momentum: 0.9 lr_policy: "step" gamma: 1 stepsize: 5000 -# Display every 200 iterations +# Display every 100 iterations display: 100 # The maximum number of iterations max_iter: 60000 diff --git a/examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt b/examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt index a57b280fd1e..a4dabd67ca0 100644 --- a/examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt +++ b/examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt @@ -17,7 +17,7 @@ momentum: 0.9 lr_policy: "step" gamma: 1 stepsize: 5000 -# Display every 200 iterations +# Display every 100 iterations display: 100 # The maximum number of iterations max_iter: 60000 diff --git a/examples/cifar10/cifar10_quick_solver.prototxt b/examples/cifar10/cifar10_quick_solver.prototxt index 
5de276f722f..14b4401ba16 100644 --- a/examples/cifar10/cifar10_quick_solver.prototxt +++ b/examples/cifar10/cifar10_quick_solver.prototxt @@ -20,7 +20,6 @@ display: 100 max_iter: 4000 # snapshot intermediate results snapshot: 4000 -snapshot_format: HDF5 snapshot_prefix: "examples/cifar10/cifar10_quick" # solver mode: CPU or GPU solver_mode: GPU diff --git a/examples/cifar10/convert_cifar_data.cpp b/examples/cifar10/convert_cifar_data.cpp index e1b89f42fb6..7385a74a679 100644 --- a/examples/cifar10/convert_cifar_data.cpp +++ b/examples/cifar10/convert_cifar_data.cpp @@ -91,6 +91,8 @@ void convert_dataset(const string& input_folder, const string& output_folder, } int main(int argc, char** argv) { + FLAGS_alsologtostderr = 1; + if (argc != 4) { printf("This script converts the CIFAR dataset to the leveldb format used\n" "by caffe to perform classification.\n" diff --git a/examples/cifar10/create_cifar10.sh b/examples/cifar10/create_cifar10.sh index a42725cb610..7ee1d6ad0a0 100755 --- a/examples/cifar10/create_cifar10.sh +++ b/examples/cifar10/create_cifar10.sh @@ -1,5 +1,6 @@ #!/usr/bin/env sh # This script converts the cifar data into leveldb format. 
+set -e EXAMPLE=examples/cifar10 DATA=data/cifar10 diff --git a/examples/cifar10/train_full.sh b/examples/cifar10/train_full.sh index ef112e1f6db..fe46e60d795 100755 --- a/examples/cifar10/train_full.sh +++ b/examples/cifar10/train_full.sh @@ -1,16 +1,17 @@ #!/usr/bin/env sh +set -e TOOLS=./build/tools $TOOLS/caffe train \ - --solver=examples/cifar10/cifar10_full_solver.prototxt + --solver=examples/cifar10/cifar10_full_solver.prototxt $@ # reduce learning rate by factor of 10 $TOOLS/caffe train \ --solver=examples/cifar10/cifar10_full_solver_lr1.prototxt \ - --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate.h5 + --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate $@ # reduce learning rate by factor of 10 $TOOLS/caffe train \ --solver=examples/cifar10/cifar10_full_solver_lr2.prototxt \ - --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate.h5 + --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate $@ diff --git a/examples/cifar10/train_full_sigmoid.sh b/examples/cifar10/train_full_sigmoid.sh index 9cff06d3e34..9b5d5213b2a 100755 --- a/examples/cifar10/train_full_sigmoid.sh +++ b/examples/cifar10/train_full_sigmoid.sh @@ -1,7 +1,8 @@ #!/usr/bin/env sh +set -e TOOLS=./build/tools $TOOLS/caffe train \ - --solver=examples/cifar10/cifar10_full_sigmoid_solver.prototxt + --solver=examples/cifar10/cifar10_full_sigmoid_solver.prototxt $@ diff --git a/examples/cifar10/train_full_sigmoid_bn.sh b/examples/cifar10/train_full_sigmoid_bn.sh index 011387c996e..05547f3a104 100755 --- a/examples/cifar10/train_full_sigmoid_bn.sh +++ b/examples/cifar10/train_full_sigmoid_bn.sh @@ -1,7 +1,8 @@ #!/usr/bin/env sh +set -e TOOLS=./build/tools $TOOLS/caffe train \ - --solver=examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt + --solver=examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt $@ diff --git a/examples/cifar10/train_quick.sh b/examples/cifar10/train_quick.sh index 6b7d228879b..257479e0d77 100755 --- 
a/examples/cifar10/train_quick.sh +++ b/examples/cifar10/train_quick.sh @@ -1,11 +1,12 @@ #!/usr/bin/env sh +set -e TOOLS=./build/tools $TOOLS/caffe train \ - --solver=examples/cifar10/cifar10_quick_solver.prototxt + --solver=examples/cifar10/cifar10_quick_solver.prototxt $@ # reduce learning rate by factor of 10 after 8 epochs $TOOLS/caffe train \ --solver=examples/cifar10/cifar10_quick_solver_lr1.prototxt \ - --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate.h5 + --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate $@ diff --git a/examples/cpp_classification/readme.md b/examples/cpp_classification/readme.md index a086db1a035..4f683aa623f 100644 --- a/examples/cpp_classification/readme.md +++ b/examples/cpp_classification/readme.md @@ -10,7 +10,7 @@ priority: 10 Caffe, at its core, is written in C++. It is possible to use the C++ API of Caffe to implement an image classification application similar -to the Python code presented in one of the Notebook example. To look +to the Python code presented in one of the Notebook examples. To look at a more general-purpose example of the Caffe C++ API, you should study the source code of the command line tool `caffe` in `tools/caffe.cpp`. @@ -19,7 +19,7 @@ study the source code of the command line tool `caffe` in `tools/caffe.cpp`. A simple C++ code is proposed in `examples/cpp_classification/classification.cpp`. For the sake of simplicity, this example does not support oversampling of a single -sample nor batching of multiple independant samples. This example is +sample nor batching of multiple independent samples. This example is not trying to reach the maximum possible classification throughput on a system, but special care was given to avoid unnecessary pessimization while keeping the code readable. @@ -42,7 +42,7 @@ script: The ImageNet labels file (also called the *synset file*) is also required in order to map a prediction to the name of the class: ``` -./data/ilsvrc12/get_ilsvrc_aux.sh. 
+./data/ilsvrc12/get_ilsvrc_aux.sh ``` Using the files that were downloaded, we can classify the provided cat image (`examples/images/cat.jpg`) using this command: diff --git a/examples/finetune_flickr_style/readme.md b/examples/finetune_flickr_style/readme.md index 9ba4c9217ff..dacfd01c8e1 100644 --- a/examples/finetune_flickr_style/readme.md +++ b/examples/finetune_flickr_style/readme.md @@ -9,7 +9,7 @@ priority: 5 # Fine-tuning CaffeNet for Style Recognition on "Flickr Style" Data Fine-tuning takes an already learned model, adapts the architecture, and resumes training from the already learned model weights. -Let's fine-tune the BVLC-distributed CaffeNet model on a different dataset, [Flickr Style](http://sergeykarayev.com/files/1311.3715v3.pdf), to predict image style instead of object category. +Let's fine-tune the BAIR-distributed CaffeNet model on a different dataset, [Flickr Style](http://sergeykarayev.com/files/1311.3715v3.pdf), to predict image style instead of object category. ## Explanation @@ -57,7 +57,11 @@ The prototxts in this example assume this, and also assume the presence of the I We'll also need the ImageNet-trained model, which you can obtain by running `./scripts/download_model_binary.py models/bvlc_reference_caffenet`. -Now we can train! (You can fine-tune in CPU mode by leaving out the `-gpu` flag.) +Now we can train! The key to fine-tuning is the `-weights` argument in the +command below, which tells Caffe that we want to load weights from a pre-trained +Caffe model. + +(You can fine-tune in CPU mode by leaving out the `-gpu` flag.) 
caffe % ./build/tools/caffe train -solver models/finetune_flickr_style/solver.prototxt -weights models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel -gpu 0 diff --git a/examples/imagenet/create_imagenet.sh b/examples/imagenet/create_imagenet.sh index e912ac43cd7..1bf08b1aa8f 100755 --- a/examples/imagenet/create_imagenet.sh +++ b/examples/imagenet/create_imagenet.sh @@ -1,6 +1,7 @@ #!/usr/bin/env sh # Create the imagenet lmdb inputs # N.B. set the path to the imagenet train + val data dirs +set -e EXAMPLE=examples/imagenet DATA=data/ilsvrc12 diff --git a/examples/imagenet/resume_training.sh b/examples/imagenet/resume_training.sh index bf7945c0fd0..4aef204368e 100755 --- a/examples/imagenet/resume_training.sh +++ b/examples/imagenet/resume_training.sh @@ -1,5 +1,7 @@ #!/usr/bin/env sh +set -e ./build/tools/caffe train \ --solver=models/bvlc_reference_caffenet/solver.prototxt \ - --snapshot=models/bvlc_reference_caffenet/caffenet_train_10000.solverstate.h5 + --snapshot=models/bvlc_reference_caffenet/caffenet_train_10000.solverstate.h5 \ + $@ diff --git a/examples/imagenet/train_caffenet.sh b/examples/imagenet/train_caffenet.sh index 94558ec5466..a5094d44ae0 100755 --- a/examples/imagenet/train_caffenet.sh +++ b/examples/imagenet/train_caffenet.sh @@ -1,4 +1,5 @@ #!/usr/bin/env sh +set -e ./build/tools/caffe train \ - --solver=models/bvlc_reference_caffenet/solver.prototxt + --solver=models/bvlc_reference_caffenet/solver.prototxt $@ diff --git a/examples/images/cat gray.jpg b/examples/images/cat gray.jpg new file mode 100644 index 00000000000..43c5ce37716 Binary files /dev/null and b/examples/images/cat gray.jpg differ diff --git a/examples/mnist/convert_mnist_data.cpp b/examples/mnist/convert_mnist_data.cpp index 16d28093dd5..57ddef77074 100644 --- a/examples/mnist/convert_mnist_data.cpp +++ b/examples/mnist/convert_mnist_data.cpp @@ -22,12 +22,15 @@ #include // NOLINT(readability/streams) #include +#include "boost/scoped_ptr.hpp" #include 
"caffe/proto/caffe.pb.h" +#include "caffe/util/db.hpp" #include "caffe/util/format.hpp" #if defined(USE_LEVELDB) && defined(USE_LMDB) using namespace caffe; // NOLINT(build/namespaces) +using boost::scoped_ptr; using std::string; DEFINE_string(backend, "lmdb", "The backend for storing the result"); @@ -67,43 +70,10 @@ void convert_dataset(const char* image_filename, const char* label_filename, image_file.read(reinterpret_cast(&cols), 4); cols = swap_endian(cols); - // lmdb - MDB_env *mdb_env; - MDB_dbi mdb_dbi; - MDB_val mdb_key, mdb_data; - MDB_txn *mdb_txn; - // leveldb - leveldb::DB* db; - leveldb::Options options; - options.error_if_exists = true; - options.create_if_missing = true; - options.write_buffer_size = 268435456; - leveldb::WriteBatch* batch = NULL; - - // Open db - if (db_backend == "leveldb") { // leveldb - LOG(INFO) << "Opening leveldb " << db_path; - leveldb::Status status = leveldb::DB::Open( - options, db_path, &db); - CHECK(status.ok()) << "Failed to open leveldb " << db_path - << ". Is it already existing?"; - batch = new leveldb::WriteBatch(); - } else if (db_backend == "lmdb") { // lmdb - LOG(INFO) << "Opening lmdb " << db_path; - CHECK_EQ(mkdir(db_path, 0744), 0) - << "mkdir " << db_path << "failed"; - CHECK_EQ(mdb_env_create(&mdb_env), MDB_SUCCESS) << "mdb_env_create failed"; - CHECK_EQ(mdb_env_set_mapsize(mdb_env, 1099511627776), MDB_SUCCESS) // 1TB - << "mdb_env_set_mapsize failed"; - CHECK_EQ(mdb_env_open(mdb_env, db_path, 0, 0664), MDB_SUCCESS) - << "mdb_env_open failed"; - CHECK_EQ(mdb_txn_begin(mdb_env, NULL, 0, &mdb_txn), MDB_SUCCESS) - << "mdb_txn_begin failed"; - CHECK_EQ(mdb_open(mdb_txn, NULL, 0, &mdb_dbi), MDB_SUCCESS) - << "mdb_open failed. Does the lmdb already exist? 
"; - } else { - LOG(FATAL) << "Unknown db backend " << db_backend; - } + + scoped_ptr db(db::GetDB(db_backend)); + db->Open(db_path, db::NEW); + scoped_ptr txn(db->NewTransaction()); // Storing to db char label; @@ -125,52 +95,19 @@ void convert_dataset(const char* image_filename, const char* label_filename, string key_str = caffe::format_int(item_id, 8); datum.SerializeToString(&value); - // Put in db - if (db_backend == "leveldb") { // leveldb - batch->Put(key_str, value); - } else if (db_backend == "lmdb") { // lmdb - mdb_data.mv_size = value.size(); - mdb_data.mv_data = reinterpret_cast(&value[0]); - mdb_key.mv_size = key_str.size(); - mdb_key.mv_data = reinterpret_cast(&key_str[0]); - CHECK_EQ(mdb_put(mdb_txn, mdb_dbi, &mdb_key, &mdb_data, 0), MDB_SUCCESS) - << "mdb_put failed"; - } else { - LOG(FATAL) << "Unknown db backend " << db_backend; - } + txn->Put(key_str, value); if (++count % 1000 == 0) { - // Commit txn - if (db_backend == "leveldb") { // leveldb - db->Write(leveldb::WriteOptions(), batch); - delete batch; - batch = new leveldb::WriteBatch(); - } else if (db_backend == "lmdb") { // lmdb - CHECK_EQ(mdb_txn_commit(mdb_txn), MDB_SUCCESS) - << "mdb_txn_commit failed"; - CHECK_EQ(mdb_txn_begin(mdb_env, NULL, 0, &mdb_txn), MDB_SUCCESS) - << "mdb_txn_begin failed"; - } else { - LOG(FATAL) << "Unknown db backend " << db_backend; - } + txn->Commit(); } } // write the last batch if (count % 1000 != 0) { - if (db_backend == "leveldb") { // leveldb - db->Write(leveldb::WriteOptions(), batch); - delete batch; - delete db; - } else if (db_backend == "lmdb") { // lmdb - CHECK_EQ(mdb_txn_commit(mdb_txn), MDB_SUCCESS) << "mdb_txn_commit failed"; - mdb_close(mdb_env, mdb_dbi); - mdb_env_close(mdb_env); - } else { - LOG(FATAL) << "Unknown db backend " << db_backend; - } - LOG(ERROR) << "Processed " << count << " files."; + txn->Commit(); } + LOG(INFO) << "Processed " << count << " files."; delete[] pixels; + db->Close(); } int main(int argc, char** argv) { @@ -178,6 
+115,8 @@ int main(int argc, char** argv) { namespace gflags = google; #endif + FLAGS_alsologtostderr = 1; + gflags::SetUsageMessage("This script converts the MNIST dataset to\n" "the lmdb/leveldb format used by Caffe to load data.\n" "Usage:\n" diff --git a/examples/mnist/create_mnist.sh b/examples/mnist/create_mnist.sh index 06ecc27de63..f5e2e7960c5 100755 --- a/examples/mnist/create_mnist.sh +++ b/examples/mnist/create_mnist.sh @@ -1,6 +1,7 @@ #!/usr/bin/env sh # This script converts the mnist data into lmdb/leveldb format, # depending on the value assigned to $BACKEND. +set -e EXAMPLE=examples/mnist DATA=data/mnist diff --git a/examples/mnist/readme.md b/examples/mnist/readme.md index b87a0f53c7a..35952155a30 100644 --- a/examples/mnist/readme.md +++ b/examples/mnist/readme.md @@ -248,7 +248,7 @@ These messages tell you the details about each layer, its connections and its ou I1203 solver.cpp:36] Solver scaffolding done. I1203 solver.cpp:44] Solving LeNet -Based on the solver setting, we will print the training loss function every 100 iterations, and test the network every 1000 iterations. You will see messages like this: +Based on the solver setting, we will print the training loss function every 100 iterations, and test the network every 500 iterations. 
You will see messages like this: I1203 solver.cpp:204] Iteration 100, lr = 0.00992565 I1203 solver.cpp:66] Iteration 100, loss = 0.26044 diff --git a/examples/mnist/train_lenet.sh b/examples/mnist/train_lenet.sh index 1b6bf7d978d..f7f9b86198d 100755 --- a/examples/mnist/train_lenet.sh +++ b/examples/mnist/train_lenet.sh @@ -1,3 +1,4 @@ #!/usr/bin/env sh +set -e -./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt +./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt $@ diff --git a/examples/mnist/train_lenet_adam.sh b/examples/mnist/train_lenet_adam.sh index a32ecf2d9c2..7b4e905681b 100755 --- a/examples/mnist/train_lenet_adam.sh +++ b/examples/mnist/train_lenet_adam.sh @@ -1,3 +1,4 @@ #!/usr/bin/env sh +set -e -./build/tools/caffe train --solver=examples/mnist/lenet_solver_adam.prototxt +./build/tools/caffe train --solver=examples/mnist/lenet_solver_adam.prototxt $@ diff --git a/examples/mnist/train_lenet_consolidated.sh b/examples/mnist/train_lenet_consolidated.sh index c855467897e..c5f02666822 100755 --- a/examples/mnist/train_lenet_consolidated.sh +++ b/examples/mnist/train_lenet_consolidated.sh @@ -1,4 +1,5 @@ #!/usr/bin/env sh +set -e ./build/tools/caffe train \ - --solver=examples/mnist/lenet_consolidated_solver.prototxt + --solver=examples/mnist/lenet_consolidated_solver.prototxt $@ diff --git a/examples/mnist/train_lenet_docker.sh b/examples/mnist/train_lenet_docker.sh index 32cf1c8e4a3..e946ba0f4ad 100755 --- a/examples/mnist/train_lenet_docker.sh +++ b/examples/mnist/train_lenet_docker.sh @@ -25,7 +25,7 @@ set -e # executed. 
# # In order to provide additional flexibility, the following shell (environment) -# variables can be used to controll the execution of each of the phases: +# variables can be used to control the execution of each of the phases: # # DOWNLOAD_DATA: Enable (1) or disable (0) the downloading of the MNIST dataset # CREATE_LMDB: Enable (1) or disable (0) the creation of the LMDB database diff --git a/examples/mnist/train_lenet_rmsprop.sh b/examples/mnist/train_lenet_rmsprop.sh index 621cab238bf..adfa7ab0fca 100755 --- a/examples/mnist/train_lenet_rmsprop.sh +++ b/examples/mnist/train_lenet_rmsprop.sh @@ -1,3 +1,5 @@ #!/usr/bin/env sh +set -e -./build/tools/caffe train --solver=examples/mnist/lenet_solver_rmsprop.prototxt +./build/tools/caffe train \ + --solver=examples/mnist/lenet_solver_rmsprop.prototxt $@ diff --git a/examples/mnist/train_mnist_autoencoder.sh b/examples/mnist/train_mnist_autoencoder.sh index cfd67e82fda..724a0f14a49 100755 --- a/examples/mnist/train_mnist_autoencoder.sh +++ b/examples/mnist/train_mnist_autoencoder.sh @@ -1,4 +1,5 @@ #!/usr/bin/env sh +set -e ./build/tools/caffe train \ - --solver=examples/mnist/mnist_autoencoder_solver.prototxt + --solver=examples/mnist/mnist_autoencoder_solver.prototxt $@ diff --git a/examples/mnist/train_mnist_autoencoder_adadelta.sh b/examples/mnist/train_mnist_autoencoder_adadelta.sh index 4be0ebddedc..a660dbb9ed2 100755 --- a/examples/mnist/train_mnist_autoencoder_adadelta.sh +++ b/examples/mnist/train_mnist_autoencoder_adadelta.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e ./build/tools/caffe train \ - --solver=examples/mnist/mnist_autoencoder_solver_adadelta.prototxt + --solver=examples/mnist/mnist_autoencoder_solver_adadelta.prototxt $@ diff --git a/examples/mnist/train_mnist_autoencoder_adagrad.sh b/examples/mnist/train_mnist_autoencoder_adagrad.sh index 95fe1b17bd5..4c11dfa67ac 100755 --- a/examples/mnist/train_mnist_autoencoder_adagrad.sh +++ b/examples/mnist/train_mnist_autoencoder_adagrad.sh @@ -1,4 +1,5 @@ 
#!/bin/bash +set -e ./build/tools/caffe train \ - --solver=examples/mnist/mnist_autoencoder_solver_adagrad.prototxt + --solver=examples/mnist/mnist_autoencoder_solver_adagrad.prototxt $@ diff --git a/examples/mnist/train_mnist_autoencoder_nesterov.sh b/examples/mnist/train_mnist_autoencoder_nesterov.sh index cf19ea749b3..fd0559d2488 100755 --- a/examples/mnist/train_mnist_autoencoder_nesterov.sh +++ b/examples/mnist/train_mnist_autoencoder_nesterov.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e ./build/tools/caffe train \ - --solver=examples/mnist/mnist_autoencoder_solver_nesterov.prototxt + --solver=examples/mnist/mnist_autoencoder_solver_nesterov.prototxt $@ diff --git a/examples/net_surgery.ipynb b/examples/net_surgery.ipynb index a6092db0c40..217c2d1a742 100644 --- a/examples/net_surgery.ipynb +++ b/examples/net_surgery.ipynb @@ -22,7 +22,6 @@ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", - "import Image\n", "\n", "# Make sure that caffe is on the python path:\n", "caffe_root = '../' # this file is expected to be in {caffe_root}/examples\n", @@ -3511,7 +3510,7 @@ "print(\"blobs {}\\nparams {}\".format(net.blobs.keys(), net.params.keys()))\n", "\n", "# load image and prepare as a single input batch for Caffe\n", - "im = np.array(Image.open('images/cat_gray.jpg'))\n", + "im = np.array(caffe.io.load_image('images/cat_gray.jpg', color=False)).squeeze()\n", "plt.title(\"original image\")\n", "plt.imshow(im)\n", "plt.axis('off')\n", @@ -4480,8 +4479,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "pre-surgery output mean -12.93\n", - "post-surgery output mean -11.93\n" + "pre-surgery output mean -0.02\n", + "post-surgery output mean 0.98\n" ] } ], @@ -4489,7 +4488,7 @@ "# pick first filter output\n", "conv0 = net.blobs['conv'].data[0, 0]\n", "print(\"pre-surgery output mean {:.2f}\".format(conv0.mean()))\n", - "# set first filter bias to 10\n", + "# set first filter bias to 1\n", "net.params['conv'][1].data[0] = 1.\n", 
"net.forward()\n", "print(\"post-surgery output mean {:.2f}\".format(conv0.mean()))" @@ -5480,7 +5479,7 @@ "\n", "Let's take the standard Caffe Reference ImageNet model \"CaffeNet\" and transform it into a fully convolutional net for efficient, dense inference on large inputs. This model generates a classification map that covers a given input size instead of a single classification. In particular a 8 $\\times$ 8 classification map on a 451 $\\times$ 451 input gives 64x the output in only 3x the time. The computation exploits a natural efficiency of convolutional network (convnet) structure by amortizing the computation of overlapping receptive fields.\n", "\n", - "To do so we translate the `InnerProduct` matrix multiplication layers of CaffeNet into `Convolutional` layers. This is the only change: the other layer types are agnostic to spatial size. Convolution is translation-invariant, activations are elementwise operations, and so on. The `fc6` inner product when carried out as convolution by `fc6-conv` turns into a 6 \\times 6 filter with stride 1 on `pool5`. Back in image space this gives a classification for each 227 $\\times$ 227 box with stride 32 in pixels. Remember the equation for output map / receptive field size, output = (input - kernel_size) / stride + 1, and work out the indexing details for a clear understanding." + "To do so we translate the `InnerProduct` matrix multiplication layers of CaffeNet into `Convolutional` layers. This is the only change: the other layer types are agnostic to spatial size. Convolution is translation-invariant, activations are elementwise operations, and so on. The `fc6` inner product when carried out as convolution by `fc6-conv` turns into a 6 $\\times$ 6 filter with stride 1 on `pool5`. Back in image space this gives a classification for each 227 $\\times$ 227 box with stride 32 in pixels. 
Remember the equation for output map / receptive field size, output = (input - kernel_size) / stride + 1, and work out the indexing details for a clear understanding." ] }, { @@ -5494,13 +5493,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "1,2c1,2\r\n", + "1,2c1\r\n", "< # Fully convolutional network version of CaffeNet.\r\n", "< name: \"CaffeNetConv\"\r\n", "---\r\n", "> name: \"CaffeNet\"\r\n", - "> input: \"data\"\r\n", - "7,11c7\r\n", + "7,11c6\r\n", "< input_param {\r\n", "< # initial shape for a fully convolutional network:\r\n", "< # the shape can be set for each input by reshape.\r\n", @@ -5508,33 +5506,33 @@ "< }\r\n", "---\r\n", "> input_param { shape: { dim: 10 dim: 3 dim: 227 dim: 227 } }\r\n", - "157,158c153,154\r\n", + "157,158c152,153\r\n", "< name: \"fc6-conv\"\r\n", "< type: \"Convolution\"\r\n", "---\r\n", "> name: \"fc6\"\r\n", "> type: \"InnerProduct\"\r\n", - "160,161c156,157\r\n", + "160,161c155,156\r\n", "< top: \"fc6-conv\"\r\n", "< convolution_param {\r\n", "---\r\n", "> top: \"fc6\"\r\n", "> inner_product_param {\r\n", - "163d158\r\n", + "163d157\r\n", "< kernel_size: 6\r\n", - "169,170c164,165\r\n", + "169,170c163,164\r\n", "< bottom: \"fc6-conv\"\r\n", "< top: \"fc6-conv\"\r\n", "---\r\n", "> bottom: \"fc6\"\r\n", "> top: \"fc6\"\r\n", - "175,176c170,171\r\n", + "175,176c169,170\r\n", "< bottom: \"fc6-conv\"\r\n", "< top: \"fc6-conv\"\r\n", "---\r\n", "> bottom: \"fc6\"\r\n", "> top: \"fc6\"\r\n", - "182,186c177,181\r\n", + "182,186c176,180\r\n", "< name: \"fc7-conv\"\r\n", "< type: \"Convolution\"\r\n", "< bottom: \"fc6-conv\"\r\n", @@ -5546,21 +5544,21 @@ "> bottom: \"fc6\"\r\n", "> top: \"fc7\"\r\n", "> inner_product_param {\r\n", - "188d182\r\n", + "188d181\r\n", "< kernel_size: 1\r\n", - "194,195c188,189\r\n", + "194,195c187,188\r\n", "< bottom: \"fc7-conv\"\r\n", "< top: \"fc7-conv\"\r\n", "---\r\n", "> bottom: \"fc7\"\r\n", "> top: \"fc7\"\r\n", - "200,201c194,195\r\n", + "200,201c193,194\r\n", "< bottom: 
\"fc7-conv\"\r\n", "< top: \"fc7-conv\"\r\n", "---\r\n", "> bottom: \"fc7\"\r\n", "> top: \"fc7\"\r\n", - "207,211c201,205\r\n", + "207,211c200,204\r\n", "< name: \"fc8-conv\"\r\n", "< type: \"Convolution\"\r\n", "< bottom: \"fc7-conv\"\r\n", @@ -5572,9 +5570,9 @@ "> bottom: \"fc7\"\r\n", "> top: \"fc8\"\r\n", "> inner_product_param {\r\n", - "213d206\r\n", + "213d205\r\n", "< kernel_size: 1\r\n", - "219c212\r\n", + "219c211\r\n", "< bottom: \"fc8-conv\"\r\n", "---\r\n", "> bottom: \"fc8\"\r\n" @@ -5610,13 +5608,6 @@ } ], "source": [ - "# Make sure that caffe is on the python path:\n", - "caffe_root = '../' # this file is expected to be in {caffe_root}/examples\n", - "import sys\n", - "sys.path.insert(0, caffe_root + 'python')\n", - "\n", - "import caffe\n", - "\n", "# Load the original network and extract the fully connected layers' parameters.\n", "net = caffe.Net('../models/bvlc_reference_caffenet/deploy.prototxt', \n", " '../models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel', \n", diff --git a/examples/pycaffe/layers/pascal_multilabel_datalayers.py b/examples/pycaffe/layers/pascal_multilabel_datalayers.py index 68e4fa7960a..9420cb328ce 100644 --- a/examples/pycaffe/layers/pascal_multilabel_datalayers.py +++ b/examples/pycaffe/layers/pascal_multilabel_datalayers.py @@ -20,7 +20,7 @@ class PascalMultilabelDataLayerSync(caffe.Layer): """ - This is a simple syncronous datalayer for training a multilabel model on + This is a simple synchronous datalayer for training a multilabel model on PASCAL. """ @@ -33,7 +33,7 @@ def setup(self, bottom, top): # params is a python dictionary with layer parameters. params = eval(self.param_str) - # Check the paramameters for validity. + # Check the parameters for validity. 
check_params(params) # store input as class variables @@ -207,7 +207,7 @@ def check_params(params): def print_info(name, params): """ - Ouput some info regarding the class + Output some info regarding the class """ print "{} initialized for split: {}, with bs: {}, im_shape: {}.".format( name, diff --git a/examples/pycaffe/tools.py b/examples/pycaffe/tools.py index 88b1834af1e..7f6c2d835fb 100644 --- a/examples/pycaffe/tools.py +++ b/examples/pycaffe/tools.py @@ -26,7 +26,7 @@ def set_scale(self, scale): def preprocess(self, im): """ - preprocess() emulate the pre-processing occuring in the vgg16 caffe + preprocess() emulate the pre-processing occurring in the vgg16 caffe prototxt. """ @@ -75,7 +75,7 @@ def __init__(self, testnet_prototxt_path="testnet.prototxt", # looks: self.sp['display'] = '25' self.sp['snapshot'] = '2500' - self.sp['snapshot_prefix'] = '"snapshot"' # string withing a string! + self.sp['snapshot_prefix'] = '"snapshot"' # string within a string! # learning rate policy self.sp['lr_policy'] = '"fixed"' diff --git a/examples/siamese/create_mnist_siamese.sh b/examples/siamese/create_mnist_siamese.sh index 43ad6b184a7..03adce54d9b 100755 --- a/examples/siamese/create_mnist_siamese.sh +++ b/examples/siamese/create_mnist_siamese.sh @@ -1,5 +1,6 @@ #!/usr/bin/env sh # This script converts the mnist data into leveldb format. 
+set -e EXAMPLES=./build/examples/siamese DATA=./data/mnist diff --git a/examples/siamese/train_mnist_siamese.sh b/examples/siamese/train_mnist_siamese.sh index 84a30a8ac44..e01ac2ceefd 100755 --- a/examples/siamese/train_mnist_siamese.sh +++ b/examples/siamese/train_mnist_siamese.sh @@ -1,5 +1,6 @@ #!/usr/bin/env sh +set -e TOOLS=./build/tools -$TOOLS/caffe train --solver=examples/siamese/mnist_siamese_solver.prototxt +$TOOLS/caffe train --solver=examples/siamese/mnist_siamese_solver.prototxt $@ diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index af360ac24bd..2f59471c29e 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -220,6 +220,7 @@ class Blob { void set_cpu_data(Dtype* data); const int* gpu_shape() const; const Dtype* gpu_data() const; + void set_gpu_data(Dtype* data); const Dtype* cpu_diff() const; const Dtype* gpu_diff() const; Dtype* mutable_cpu_data(); diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 3c6a076ec2f..23c6e59f87e 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -61,10 +61,27 @@ private:\ const std::vector& propagate_down, \ const std::vector*>& bottom) +#define INSTANTIATE_LAYER_GPU_DECONV(classname) \ + template void classname::Deconv_gpu( \ + const std::vector*>& top, \ + const std::vector& propagate_down, \ + const std::vector*>& bottom, \ + int deconv_type); \ + template void classname::Deconv_gpu( \ + const std::vector*>& top, \ + const std::vector& propagate_down, \ + const std::vector*>& bottom, \ + int deconv_type) + #define INSTANTIATE_LAYER_GPU_FUNCS(classname) \ INSTANTIATE_LAYER_GPU_FORWARD(classname); \ INSTANTIATE_LAYER_GPU_BACKWARD(classname) +#define INSTANTIATE_LAYER_GPU_FUNCS_WITH_DECONV(classname) \ + INSTANTIATE_LAYER_GPU_FORWARD(classname); \ + INSTANTIATE_LAYER_GPU_BACKWARD(classname); \ + INSTANTIATE_LAYER_GPU_DECONV(classname) + // A simple macro to mark codes that are not implemented, so that when the code // is executed we will see a fatal 
log. #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet" @@ -158,11 +175,14 @@ class Caffe { // Search from start_id to the highest possible device ordinal, // return the ordinal of the first available device. static int FindDevice(const int start_id = 0); - // Parallel training info + // Parallel training inline static int solver_count() { return Get().solver_count_; } inline static void set_solver_count(int val) { Get().solver_count_ = val; } - inline static bool root_solver() { return Get().root_solver_; } - inline static void set_root_solver(bool val) { Get().root_solver_ = val; } + inline static int solver_rank() { return Get().solver_rank_; } + inline static void set_solver_rank(int val) { Get().solver_rank_ = val; } + inline static bool multiprocess() { return Get().multiprocess_; } + inline static void set_multiprocess(bool val) { Get().multiprocess_ = val; } + inline static bool root_solver() { return Get().solver_rank_ == 0; } protected: #ifndef CPU_ONLY @@ -172,8 +192,11 @@ class Caffe { shared_ptr random_generator_; Brew mode_; + + // Parallel training int solver_count_; - bool root_solver_; + int solver_rank_; + bool multiprocess_; private: // The private constructor to avoid duplicate instantiation. diff --git a/include/caffe/data_reader.hpp b/include/caffe/data_reader.hpp deleted file mode 100644 index 8ed5542cb8d..00000000000 --- a/include/caffe/data_reader.hpp +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef CAFFE_DATA_READER_HPP_ -#define CAFFE_DATA_READER_HPP_ - -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/internal_thread.hpp" -#include "caffe/util/blocking_queue.hpp" -#include "caffe/util/db.hpp" - -namespace caffe { - -/** - * @brief Reads data from a source to queues available to data layers. - * A single reading thread is created per source, even if multiple solvers - * are running in parallel, e.g. for multi-GPU training. 
This makes sure - * databases are read sequentially, and that each solver accesses a different - * subset of the database. Data is distributed to solvers in a round-robin - * way to keep parallel training deterministic. - */ -class DataReader { - public: - explicit DataReader(const LayerParameter& param); - ~DataReader(); - - inline BlockingQueue& free() const { - return queue_pair_->free_; - } - inline BlockingQueue& full() const { - return queue_pair_->full_; - } - - protected: - // Queue pairs are shared between a body and its readers - class QueuePair { - public: - explicit QueuePair(int size); - ~QueuePair(); - - BlockingQueue free_; - BlockingQueue full_; - - DISABLE_COPY_AND_ASSIGN(QueuePair); - }; - - // A single body is created per source - class Body : public InternalThread { - public: - explicit Body(const LayerParameter& param); - virtual ~Body(); - - protected: - void InternalThreadEntry(); - void read_one(db::Cursor* cursor, QueuePair* qp); - - const LayerParameter param_; - BlockingQueue > new_queue_pairs_; - - friend class DataReader; - - DISABLE_COPY_AND_ASSIGN(Body); - }; - - // A source is uniquely identified by its layer name + path, in case - // the same database is read from two different locations in the net. 
- static inline string source_key(const LayerParameter& param) { - return param.name() + ":" + param.data_param().source(); - } - - const shared_ptr queue_pair_; - shared_ptr body_; - - static map > bodies_; - -DISABLE_COPY_AND_ASSIGN(DataReader); -}; - -} // namespace caffe - -#endif // CAFFE_DATA_READER_HPP_ diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp index 6a8c5a02892..0ba67665035 100644 --- a/include/caffe/internal_thread.hpp +++ b/include/caffe/internal_thread.hpp @@ -42,8 +42,8 @@ class InternalThread { bool must_stop(); private: - void entry(int device, Caffe::Brew mode, int rand_seed, int solver_count, - bool root_solver); + void entry(int device, Caffe::Brew mode, int rand_seed, + int solver_count, int solver_rank, bool multiprocess); shared_ptr thread_; }; diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 10f353f94f9..881d7311503 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -38,7 +38,7 @@ class Layer { * layer. */ explicit Layer(const LayerParameter& param) - : layer_param_(param), is_shared_(false) { + : layer_param_(param) { // Set phase and copy blobs (if there are any). phase_ = param.phase(); if (layer_param_.blobs_size() > 0) { @@ -66,7 +66,6 @@ class Layer { */ void SetUp(const vector*>& bottom, const vector*>& top) { - InitMutex(); CheckBlobCounts(bottom, top); LayerSetUp(bottom, top); Reshape(bottom, top); @@ -92,30 +91,6 @@ class Layer { virtual void LayerSetUp(const vector*>& bottom, const vector*>& top) {} - /** - * @brief Whether a layer should be shared by multiple nets during data - * parallelism. By default, all layers except for data layers should - * not be shared. data layers should be shared to ensure each worker - * solver access data sequentially during data parallelism. - */ - virtual inline bool ShareInParallel() const { return false; } - - /** @brief Return whether this layer is actually shared by other nets. 
- * If ShareInParallel() is true and using more than one GPU and the - * net has TRAIN phase, then this function is expected return true. - */ - inline bool IsShared() const { return is_shared_; } - - /** @brief Set whether this layer is actually shared by other nets - * If ShareInParallel() is true and using more than one GPU and the - * net has TRAIN phase, then is_shared should be set true. - */ - inline void SetShared(bool is_shared) { - CHECK(ShareInParallel() || !is_shared) - << type() << "Layer does not support sharing."; - is_shared_ = is_shared; - } - /** * @brief Adjust the shapes of top blobs and internal buffers to accommodate * the shapes of the bottom blobs. @@ -176,6 +151,28 @@ class Layer { const vector& propagate_down, const vector*>& bottom); + /** + * @brief Given the top blob deconv info, compute the bottom blob deconv. Similar to Backward. + * + * The Deconv wrapper calls the relevant device wrapper function + * (Deconv_cpu or Deconv_gpu) to compute the bottom blob diffs given the + * top blob diffs. + * + * Your layer should implement Deconv_cpu and Deconv_gpu. + * + * Note: By default, Deconv_gpu will just call Backward_gpu, and + * Deconv_cpu will just call Backward_cpu. In many cases this + * behavior is desired, e.g. for convolution or innerproduct or + * pooling layers. If this is not the desired behavior, override + * Deconv_cpu AND Deconv_gpu. If only one of Deconv_{cpu,gpu} is + * overridden, the other will still defer to Backward_{cpu,gpu}, + * which will lead to confusing and inconsistent behavior! + */ + inline void Deconv(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom, + int deconv_type); + /** * @brief Returns the vector of learnable parameter blobs. */ @@ -363,6 +360,30 @@ class Layer { Backward_cpu(top, propagate_down, bottom); } + /** + * @brief Using the CPU device, compute the deconv for the bottom blobs. 
+ * deconv is calculated either using Zeiler et al, 2013 or via "guided backprop" + */ + virtual void Deconv_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom, + int deconv_type) { + // LOG(WARNING) << "Explicit Deconv_cpu not implemented for " << type() << " yet; falling back to backward_cpu."; + Backward_cpu(top, propagate_down, bottom); + } + /** + * @brief Using the GPU device, compute the deconv for the bottom blobs. + * deconv is calculated either using Zeiler et al, 2013 or via "guided backprop" + * Fall back to Deconv_cpu() if unavailable. + */ + virtual void Deconv_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom, + int deconv_type) { + // LOG(WARNING) << "Explicit Deconv_gpu not implemented for " << type() << " yet; falling back to backward_gpu."; + Deconv_cpu(top, propagate_down, bottom, deconv_type); + } + /** * Called by the parent Layer's SetUp to check that the number of bottom * and top Blobs provided as input match the expected numbers specified by @@ -428,19 +449,6 @@ class Layer { } private: - /** Whether this layer is actually shared by other nets*/ - bool is_shared_; - - /** The mutex for sequential forward if this layer is shared */ - shared_ptr forward_mutex_; - - /** Initialize forward_mutex_ */ - void InitMutex(); - /** Lock forward_mutex_ if this layer is shared */ - void Lock(); - /** Unlock forward_mutex_ if this layer is shared */ - void Unlock(); - DISABLE_COPY_AND_ASSIGN(Layer); }; // class Layer @@ -450,8 +458,6 @@ class Layer { template inline Dtype Layer::Forward(const vector*>& bottom, const vector*>& top) { - // Lock during forward to ensure sequential forward - Lock(); Dtype loss = 0; Reshape(bottom, top); switch (Caffe::mode()) { @@ -482,7 +488,6 @@ inline Dtype Layer::Forward(const vector*>& bottom, default: LOG(FATAL) << "Unknown caffe mode."; } - Unlock(); return loss; } @@ -502,6 +507,23 @@ inline void Layer::Backward(const vector*>& top, } } +template +inline 
void Layer::Deconv(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom, + int deconv_type) { + switch (Caffe::mode()) { + case Caffe::CPU: + Deconv_cpu(top, propagate_down, bottom, deconv_type); + break; + case Caffe::GPU: + Deconv_gpu(top, propagate_down, bottom, deconv_type); + break; + default: + LOG(FATAL) << "Unknown caffe mode."; + } +} + // Serialize LayerParameter to protocol buffer template void Layer::ToProto(LayerParameter* param, bool write_diff) { diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp index f385afccfee..2369c132911 100644 --- a/include/caffe/layer_factory.hpp +++ b/include/caffe/layer_factory.hpp @@ -1,6 +1,6 @@ /** * @brief A layer factory that allows one to register layers. - * During runtime, registered layers could be called by passing a LayerParameter + * During runtime, registered layers can be called by passing a LayerParameter * protobuffer to the CreateLayer function: * * LayerRegistry::CreateLayer(param); diff --git a/include/caffe/layers/accuracy_layer.hpp b/include/caffe/layers/accuracy_layer.hpp index fe2adb939e4..a9ad3225149 100644 --- a/include/caffe/layers/accuracy_layer.hpp +++ b/include/caffe/layers/accuracy_layer.hpp @@ -39,7 +39,7 @@ class AccuracyLayer : public Layer { // If there are two top blobs, then the second blob will contain // accuracies per class. virtual inline int MinTopBlobs() const { return 1; } - virtual inline int MaxTopBlos() const { return 2; } + virtual inline int MaxTopBlobs() const { return 2; } protected: /** diff --git a/include/caffe/layers/base_data_layer.hpp b/include/caffe/layers/base_data_layer.hpp index 2c49b73184b..c8b6998c8f2 100644 --- a/include/caffe/layers/base_data_layer.hpp +++ b/include/caffe/layers/base_data_layer.hpp @@ -26,8 +26,6 @@ class BaseDataLayer : public Layer { // This method may not be overridden except by the BasePrefetchingDataLayer. 
virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } virtual void DataLayerSetUp(const vector*>& bottom, const vector*>& top) {} // Data layers have no bottoms, so reshaping is trivial. @@ -67,16 +65,14 @@ class BasePrefetchingDataLayer : virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); - // Prefetches batches (asynchronously if to GPU memory) - static const int PREFETCH_COUNT = 3; - protected: virtual void InternalThreadEntry(); virtual void load_batch(Batch* batch) = 0; - Batch prefetch_[PREFETCH_COUNT]; + vector > > prefetch_; BlockingQueue*> prefetch_free_; BlockingQueue*> prefetch_full_; + Batch* prefetch_current_; Blob transformed_data_; }; diff --git a/include/caffe/layers/batch_norm_layer.hpp b/include/caffe/layers/batch_norm_layer.hpp index 9b2d5126efb..43f7b28be95 100644 --- a/include/caffe/layers/batch_norm_layer.hpp +++ b/include/caffe/layers/batch_norm_layer.hpp @@ -13,25 +13,22 @@ namespace caffe { * @brief Normalizes the input to have 0-mean and/or unit (1) variance across * the batch. * - * This layer computes Batch Normalization described in [1]. For - * each channel in the data (i.e. axis 1), it subtracts the mean and divides - * by the variance, where both statistics are computed across both spatial - * dimensions and across the different examples in the batch. + * This layer computes Batch Normalization as described in [1]. For each channel + * in the data (i.e. axis 1), it subtracts the mean and divides by the variance, + * where both statistics are computed across both spatial dimensions and across + * the different examples in the batch. * - * By default, during training time, the network is computing global mean/ - * variance statistics via a running average, which is then used at test - * time to allow deterministic outputs for each input. 
You can manually - * toggle whether the network is accumulating or using the statistics via the - * use_global_stats option. IMPORTANT: for this feature to work, you MUST - * set the learning rate to zero for all three parameter blobs, i.e., - * param {lr_mult: 0} three times in the layer definition. + * By default, during training time, the network is computing global + * mean/variance statistics via a running average, which is then used at test + * time to allow deterministic outputs for each input. You can manually toggle + * whether the network is accumulating or using the statistics via the + * use_global_stats option. For reference, these statistics are kept in the + * layer's three blobs: (0) mean, (1) variance, and (2) moving average factor. * * Note that the original paper also included a per-channel learned bias and - * scaling factor. It is possible (though a bit cumbersome) to implement - * this in caffe using a single-channel DummyDataLayer filled with zeros, - * followed by a Convolution layer with output the same size as the current. - * This produces a channel-specific value that can be added or multiplied by - * the BatchNorm layer's output. + * scaling factor. To implement this in Caffe, define a `ScaleLayer` configured + * with `bias_term: true` after each `BatchNormLayer` to handle both the bias + * and scaling factor. * * [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network * Training by Reducing Internal Covariate Shift." arXiv preprint diff --git a/include/caffe/layers/bias_layer.hpp b/include/caffe/layers/bias_layer.hpp index eedc3aaa351..9639c9cdc8a 100644 --- a/include/caffe/layers/bias_layer.hpp +++ b/include/caffe/layers/bias_layer.hpp @@ -10,13 +10,13 @@ namespace caffe { /** - * @brief Computes a sum of two input Blobs, with the shape of the - * latter Blob "broadcast" to match the shape of the former. - * Equivalent to tiling the latter Blob, then computing the elementwise - * sum. 
+ * @brief Computes a sum of two input Blobs, with the shape of the latter Blob + * "broadcast" to match the shape of the former. Equivalent to tiling + * the latter Blob, then computing the elementwise sum. * * The second input may be omitted, in which case it's learned as a parameter - * of the layer. + * of the layer. Note: in case bias and scaling are desired, both operations can + * be handled by `ScaleLayer` configured with `bias_term: true`. */ template class BiasLayer : public Layer { diff --git a/include/caffe/layers/crop_layer.hpp b/include/caffe/layers/crop_layer.hpp index 5c605b2ae9e..5219fa5cb5f 100644 --- a/include/caffe/layers/crop_layer.hpp +++ b/include/caffe/layers/crop_layer.hpp @@ -41,18 +41,29 @@ class CropLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - vector offsets; + Blob offsets; + Blob src_strides_; + Blob dest_strides_; private: + // Recursive copy function. void crop_copy(const vector*>& bottom, const vector*>& top, - const vector& offsets, + const int* offsets, vector indices, int cur_dim, const Dtype* src_data, Dtype* dest_data, bool is_forward); + // Recursive copy function: this is similar to crop_copy() but loops over all + // but the last two dimensions to allow for ND cropping while still relying on + // a CUDA kernel for the innermost two dimensions for performance reasons. An + // alternative implementation could rely on the kernel more by passing + // offsets, but this is problematic because of its variable length. + // Since in the standard (N,C,W,H) case N,C are usually not cropped a speedup + // could be achieved by not looping the application of the copy_kernel around + // these dimensions. 
void crop_copy_gpu(const vector*>& bottom, const vector*>& top, const vector& offsets, diff --git a/include/caffe/layers/cudnn_relu_layer.hpp b/include/caffe/layers/cudnn_relu_layer.hpp index e01f568abc9..a1cb29e7c5f 100644 --- a/include/caffe/layers/cudnn_relu_layer.hpp +++ b/include/caffe/layers/cudnn_relu_layer.hpp @@ -37,6 +37,7 @@ class CuDNNReLULayer : public ReLULayer { cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; + cudnnActivationDescriptor_t activ_desc_; }; #endif diff --git a/include/caffe/layers/cudnn_sigmoid_layer.hpp b/include/caffe/layers/cudnn_sigmoid_layer.hpp index 9c597958b0b..7b3486f8a7e 100644 --- a/include/caffe/layers/cudnn_sigmoid_layer.hpp +++ b/include/caffe/layers/cudnn_sigmoid_layer.hpp @@ -37,6 +37,7 @@ class CuDNNSigmoidLayer : public SigmoidLayer { cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; + cudnnActivationDescriptor_t activ_desc_; }; #endif diff --git a/include/caffe/layers/cudnn_tanh_layer.hpp b/include/caffe/layers/cudnn_tanh_layer.hpp index c0f0053f71e..59e758d7031 100644 --- a/include/caffe/layers/cudnn_tanh_layer.hpp +++ b/include/caffe/layers/cudnn_tanh_layer.hpp @@ -37,6 +37,7 @@ class CuDNNTanHLayer : public TanHLayer { cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; + cudnnActivationDescriptor_t activ_desc_; }; #endif diff --git a/include/caffe/layers/data_layer.hpp b/include/caffe/layers/data_layer.hpp index 6c361791a0c..667a4ae43a5 100644 --- a/include/caffe/layers/data_layer.hpp +++ b/include/caffe/layers/data_layer.hpp @@ -4,7 +4,6 @@ #include #include "caffe/blob.hpp" -#include "caffe/data_reader.hpp" #include "caffe/data_transformer.hpp" #include "caffe/internal_thread.hpp" #include "caffe/layer.hpp" @@ -21,17 +20,19 @@ class DataLayer : public BasePrefetchingDataLayer { virtual ~DataLayer(); virtual void DataLayerSetUp(const vector*>& bottom, const vector*>& top); - // 
DataLayer uses DataReader instead for sharing for parallelism - virtual inline bool ShareInParallel() const { return false; } virtual inline const char* type() const { return "Data"; } virtual inline int ExactNumBottomBlobs() const { return 0; } virtual inline int MinTopBlobs() const { return 1; } virtual inline int MaxTopBlobs() const { return 2; } protected: + void Next(); + bool Skip(); virtual void load_batch(Batch* batch); - DataReader reader_; + shared_ptr db_; + shared_ptr cursor_; + uint64_t offset_; }; } // namespace caffe diff --git a/include/caffe/layers/dummy_data_layer.hpp b/include/caffe/layers/dummy_data_layer.hpp index 4180f1d01e4..13a63d47ec4 100644 --- a/include/caffe/layers/dummy_data_layer.hpp +++ b/include/caffe/layers/dummy_data_layer.hpp @@ -22,8 +22,6 @@ class DummyDataLayer : public Layer { : Layer(param) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } // Data layers have no bottoms, so reshaping is trivial. virtual void Reshape(const vector*>& bottom, const vector*>& top) {} diff --git a/include/caffe/layers/hdf5_data_layer.hpp b/include/caffe/layers/hdf5_data_layer.hpp index b04cf8e1940..601b36c6b89 100644 --- a/include/caffe/layers/hdf5_data_layer.hpp +++ b/include/caffe/layers/hdf5_data_layer.hpp @@ -23,12 +23,10 @@ template class HDF5DataLayer : public Layer { public: explicit HDF5DataLayer(const LayerParameter& param) - : Layer(param) {} + : Layer(param), offset_() {} virtual ~HDF5DataLayer(); virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } // Data layers have no bottoms, so reshaping is trivial. 
virtual void Reshape(const vector*>& bottom, const vector*>& top) {} @@ -38,6 +36,9 @@ class HDF5DataLayer : public Layer { virtual inline int MinTopBlobs() const { return 1; } protected: + void Next(); + bool Skip(); + virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, @@ -55,6 +56,7 @@ class HDF5DataLayer : public Layer { std::vector > > hdf_blobs_; std::vector data_permutation_; std::vector file_permutation_; + uint64_t offset_; }; } // namespace caffe diff --git a/include/caffe/layers/hdf5_output_layer.hpp b/include/caffe/layers/hdf5_output_layer.hpp index 487d08fc06c..061e279d7a0 100644 --- a/include/caffe/layers/hdf5_output_layer.hpp +++ b/include/caffe/layers/hdf5_output_layer.hpp @@ -28,8 +28,6 @@ class HDF5OutputLayer : public Layer { virtual ~HDF5OutputLayer(); virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } // Data layers have no bottoms, so reshaping is trivial. virtual void Reshape(const vector*>& bottom, const vector*>& top) {} diff --git a/include/caffe/layers/infogain_loss_layer.hpp b/include/caffe/layers/infogain_loss_layer.hpp index 633f339a28e..edecde829ad 100644 --- a/include/caffe/layers/infogain_loss_layer.hpp +++ b/include/caffe/layers/infogain_loss_layer.hpp @@ -8,6 +8,7 @@ #include "caffe/proto/caffe.pb.h" #include "caffe/layers/loss_layer.hpp" +#include "caffe/layers/softmax_layer.hpp" namespace caffe { @@ -60,6 +61,12 @@ class InfogainLossLayer : public LossLayer { virtual inline int MinBottomBlobs() const { return 2; } virtual inline int MaxBottomBlobs() const { return 3; } + // InfogainLossLayer computes softmax prob internally. 
+ // optional second "top" outputs the softmax prob + virtual inline int ExactNumTopBlobs() const { return -1; } + virtual inline int MinTopBlobs() const { return 1; } + virtual inline int MaxTopBlobs() const { return 2; } + virtual inline const char* type() const { return "InfogainLoss"; } protected: @@ -102,7 +109,35 @@ class InfogainLossLayer : public LossLayer { virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + /// Read the normalization mode parameter and compute the normalizer based + /// on the blob size. If normalization_mode is VALID, the count of valid + /// outputs will be read from valid_count, unless it is -1 in which case + /// all outputs are assumed to be valid. + virtual Dtype get_normalizer( + LossParameter_NormalizationMode normalization_mode, int valid_count); + /// fill sum_rows_H_ according to matrix H + virtual void sum_rows_of_H(const Blob* H); + + /// The internal SoftmaxLayer used to map predictions to a distribution. + shared_ptr > softmax_layer_; + /// prob stores the output probability predictions from the SoftmaxLayer. + Blob prob_; + /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_bottom_vec_; + /// top vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_top_vec_; + Blob infogain_; + Blob sum_rows_H_; // cache the row sums of H. + + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. + int ignore_label_; + /// How to normalize the output loss. 
+ LossParameter_NormalizationMode normalization_; + + int infogain_axis_, outer_num_, inner_num_, num_labels_; }; } // namespace caffe diff --git a/include/caffe/layers/input_layer.hpp b/include/caffe/layers/input_layer.hpp index f4472678c69..0ffdc724894 100644 --- a/include/caffe/layers/input_layer.hpp +++ b/include/caffe/layers/input_layer.hpp @@ -22,8 +22,6 @@ class InputLayer : public Layer { : Layer(param) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } // Data layers have no bottoms, so reshaping is trivial. virtual void Reshape(const vector*>& bottom, const vector*>& top) {} diff --git a/include/caffe/layers/lrn_layer.hpp b/include/caffe/layers/lrn_layer.hpp index 06cf71a94cb..840ca76724a 100644 --- a/include/caffe/layers/lrn_layer.hpp +++ b/include/caffe/layers/lrn_layer.hpp @@ -43,6 +43,15 @@ class LRNLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + virtual void Deconv_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, int deconv_type); + virtual void Deconv_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, int deconv_type); + virtual void Deconv_passthrough_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, int deconv_type); + virtual void Deconv_passthrough_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, int deconv_type); + virtual void CrossChannelForward_cpu(const vector*>& bottom, const vector*>& top); virtual void CrossChannelForward_gpu(const vector*>& bottom, @@ -87,6 +96,9 @@ class LRNLayer : public Layer { shared_ptr > product_layer_; Blob product_input_; vector*> product_bottom_vec_; + + // Fields used for deconv + bool deconv_ignore_; }; } // namespace caffe diff --git 
a/include/caffe/layers/lstm_layer.hpp b/include/caffe/layers/lstm_layer.hpp new file mode 100644 index 00000000000..a0e67c9d432 --- /dev/null +++ b/include/caffe/layers/lstm_layer.hpp @@ -0,0 +1,154 @@ +#ifndef CAFFE_LSTM_LAYER_HPP_ +#define CAFFE_LSTM_LAYER_HPP_ + +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/recurrent_layer.hpp" +#include "caffe/net.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +template class RecurrentLayer; + +/** + * @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM) + * [1] style recurrent neural network (RNN). Implemented by unrolling + * the LSTM computation through time. + * + * The specific architecture used in this implementation is as described in + * "Learning to Execute" [2], reproduced below: + * i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ] + * f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ] + * o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ] + * g_t := \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ] + * c_t := (f_t .* c_{t-1}) + (i_t .* g_t) + * h_t := o_t .* \tanh[c_t] + * In the implementation, the i, f, o, and g computations are performed as a + * single inner product. + * + * Notably, this implementation lacks the "diagonal" gates, as used in the + * LSTM architectures described by Alex Graves [3] and others. + * + * [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory." + * Neural Computation 9, no. 8 (1997): 1735-1780. + * + * [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute." + * arXiv preprint arXiv:1410.4615 (2014). + * + * [3] Graves, Alex. "Generating sequences with recurrent neural networks." + * arXiv preprint arXiv:1308.0850 (2013). 
+ */ +template +class LSTMLayer : public RecurrentLayer { + public: + explicit LSTMLayer(const LayerParameter& param) + : RecurrentLayer(param) {} + + virtual inline const char* type() const { return "LSTM"; } + + protected: + virtual void FillUnrolledNet(NetParameter* net_param) const; + virtual void RecurrentInputBlobNames(vector* names) const; + virtual void RecurrentOutputBlobNames(vector* names) const; + virtual void RecurrentInputShapes(vector* shapes) const; + virtual void OutputBlobNames(vector* names) const; +}; + +/** + * @brief A helper for LSTMLayer: computes a single timestep of the + * non-linearity of the LSTM, producing the updated cell and hidden + * states. + */ +template +class LSTMUnitLayer : public Layer { + public: + explicit LSTMUnitLayer(const LayerParameter& param) + : Layer(param) {} + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "LSTMUnit"; } + virtual inline int ExactNumBottomBlobs() const { return 3; } + virtual inline int ExactNumTopBlobs() const { return 2; } + + virtual inline bool AllowForceBackward(const int bottom_index) const { + // Can't propagate to sequence continuation indicators. 
+ return bottom_index != 2; + } + + protected: + /** + * @param bottom input Blob vector (length 3) + * -# @f$ (1 \times N \times D) @f$ + * the previous timestep cell state @f$ c_{t-1} @f$ + * -# @f$ (1 \times N \times 4D) @f$ + * the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$ + * -# @f$ (1 \times N) @f$ + * the sequence continuation indicators @f$ \delta_t @f$ + * @param top output Blob vector (length 2) + * -# @f$ (1 \times N \times D) @f$ + * the updated cell state @f$ c_t @f$, computed as: + * i_t := \sigmoid[i_t'] + * f_t := \sigmoid[f_t'] + * o_t := \sigmoid[o_t'] + * g_t := \tanh[g_t'] + * c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) + * -# @f$ (1 \times N \times D) @f$ + * the updated hidden state @f$ h_t @f$, computed as: + * h_t := o_t .* \tanh[c_t] + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the LSTMUnit inputs. + * + * @param top output Blob vector (length 2), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times N \times D) @f$: + * containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$ + * with respect to the updated cell state @f$ c_t @f$ + * -# @f$ (1 \times N \times D) @f$: + * containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$ + * with respect to the updated cell state @f$ h_t @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 3), into which the error gradients + * with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate + * inputs are computed. Computation of the error gradients w.r.t. + * the sequence indicators is not implemented. + * -# @f$ (1 \times N \times D) @f$ + * the error gradient w.r.t. the previous timestep cell state + * @f$ c_{t-1} @f$ + * -# @f$ (1 \times N \times 4D) @f$ + * the error gradient w.r.t. 
the "gate inputs" + * @f$ [ + * \frac{\partial E}{\partial i_t} + * \frac{\partial E}{\partial f_t} + * \frac{\partial E}{\partial o_t} + * \frac{\partial E}{\partial g_t} + * ] @f$ + * -# @f$ (1 \times 1 \times N) @f$ + * the gradient w.r.t. the sequence continuation indicators + * @f$ \delta_t @f$ is currently not computed. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief The hidden and output dimension. + int hidden_dim_; + Blob X_acts_; +}; + +} // namespace caffe + +#endif // CAFFE_LSTM_LAYER_HPP_ diff --git a/include/caffe/layers/parameter_layer.hpp b/include/caffe/layers/parameter_layer.hpp new file mode 100644 index 00000000000..188b92acbe2 --- /dev/null +++ b/include/caffe/layers/parameter_layer.hpp @@ -0,0 +1,45 @@ +#ifndef CAFFE_PARAMETER_LAYER_HPP_ +#define CAFFE_PARAMETER_LAYER_HPP_ + +#include + +#include "caffe/layer.hpp" + +namespace caffe { + +template +class ParameterLayer : public Layer { + public: + explicit ParameterLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top) { + if (this->blobs_.size() > 0) { + LOG(INFO) << "Skipping parameter initialization"; + } else { + this->blobs_.resize(1); + this->blobs_[0].reset(new Blob()); + this->blobs_[0]->Reshape(this->layer_param_.parameter_param().shape()); + } + top[0]->Reshape(this->layer_param_.parameter_param().shape()); + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { } + virtual inline const char* type() const { return "Parameter"; } + virtual inline int ExactNumBottomBlobs() const { return 0; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + top[0]->ShareData(*(this->blobs_[0])); + 
top[0]->ShareDiff(*(this->blobs_[0])); + } + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) + { } +}; + +} // namespace caffe + +#endif diff --git a/include/caffe/layers/python_layer.hpp b/include/caffe/layers/python_layer.hpp index b839d52684e..1407d9217aa 100644 --- a/include/caffe/layers/python_layer.hpp +++ b/include/caffe/layers/python_layer.hpp @@ -21,11 +21,12 @@ class PythonLayer : public Layer { // Disallow PythonLayer in MultiGPU training stage, due to GIL issues // Details: https://github.com/BVLC/caffe/issues/2936 if (this->phase_ == TRAIN && Caffe::solver_count() > 1 - && !ShareInParallel()) { - LOG(FATAL) << "PythonLayer is not implemented in Multi-GPU training"; + && !Caffe::multiprocess()) { + LOG(FATAL) << "PythonLayer does not support CLI Multi-GPU, use train.py"; } self_.attr("param_str") = bp::str( this->layer_param_.python_param().param_str()); + self_.attr("phase") = static_cast(this->phase_); self_.attr("setup")(bottom, top); } virtual void Reshape(const vector*>& bottom, @@ -33,10 +34,6 @@ class PythonLayer : public Layer { self_.attr("reshape")(bottom, top); } - virtual inline bool ShareInParallel() const { - return this->layer_param_.python_param().share_in_parallel(); - } - virtual inline const char* type() const { return "Python"; } protected: diff --git a/include/caffe/layers/recurrent_layer.hpp b/include/caffe/layers/recurrent_layer.hpp new file mode 100644 index 00000000000..ca17371b994 --- /dev/null +++ b/include/caffe/layers/recurrent_layer.hpp @@ -0,0 +1,187 @@ +#ifndef CAFFE_RECURRENT_LAYER_HPP_ +#define CAFFE_RECURRENT_LAYER_HPP_ + +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/net.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/format.hpp" + +namespace caffe { + +template class RecurrentLayer; + +/** + * @brief An abstract class for implementing recurrent behavior inside of 
an + * unrolled network. This Layer type cannot be instantiated -- instead, + * you should use one of its implementations which defines the recurrent + * architecture, such as RNNLayer or LSTMLayer. + */ +template +class RecurrentLayer : public Layer { + public: + explicit RecurrentLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual void Reset(); + + virtual inline const char* type() const { return "Recurrent"; } + virtual inline int MinBottomBlobs() const { + int min_bottoms = 2; + if (this->layer_param_.recurrent_param().expose_hidden()) { + vector inputs; + this->RecurrentInputBlobNames(&inputs); + min_bottoms += inputs.size(); + } + return min_bottoms; + } + virtual inline int MaxBottomBlobs() const { return MinBottomBlobs() + 1; } + virtual inline int ExactNumTopBlobs() const { + int num_tops = 1; + if (this->layer_param_.recurrent_param().expose_hidden()) { + vector outputs; + this->RecurrentOutputBlobNames(&outputs); + num_tops += outputs.size(); + } + return num_tops; + } + + virtual inline bool AllowForceBackward(const int bottom_index) const { + // Can't propagate to sequence continuation indicators. + return bottom_index != 1; + } + + protected: + /** + * @brief Fills net_param with the recurrent network architecture. Subclasses + * should define this -- see RNNLayer and LSTMLayer for examples. + */ + virtual void FillUnrolledNet(NetParameter* net_param) const = 0; + + /** + * @brief Fills names with the names of the 0th timestep recurrent input + * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentInputBlobNames(vector* names) const = 0; + + /** + * @brief Fills shapes with the shapes of the recurrent input Blob&s. + * Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. 
+ */ + virtual void RecurrentInputShapes(vector* shapes) const = 0; + + /** + * @brief Fills names with the names of the Tth timestep recurrent output + * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentOutputBlobNames(vector* names) const = 0; + + /** + * @brief Fills names with the names of the output blobs, concatenated across + * all timesteps. Should return a name for each top Blob. + * Subclasses should define this -- see RNNLayer and LSTMLayer for + * examples. + */ + virtual void OutputBlobNames(vector* names) const = 0; + + /** + * @param bottom input Blob vector (length 2-3) + * + * -# @f$ (T \times N \times ...) @f$ + * the time-varying input @f$ x @f$. After the first two axes, whose + * dimensions must correspond to the number of timesteps @f$ T @f$ and + * the number of independent streams @f$ N @f$, respectively, its + * dimensions may be arbitrary. Note that the ordering of dimensions -- + * @f$ (T \times N \times ...) @f$, rather than + * @f$ (N \times T \times ...) @f$ -- means that the @f$ N @f$ + * independent input streams must be "interleaved". + * + * -# @f$ (T \times N) @f$ + * the sequence continuation indicators @f$ \delta @f$. + * These inputs should be binary (0 or 1) indicators, where + * @f$ \delta_{t,n} = 0 @f$ means that timestep @f$ t @f$ of stream + * @f$ n @f$ is the beginning of a new sequence, and hence the previous + * hidden state @f$ h_{t-1} @f$ is multiplied by @f$ \delta_t = 0 @f$ + * and has no effect on the cell's output at timestep @f$ t @f$, and + * a value of @f$ \delta_{t,n} = 1 @f$ means that timestep @f$ t @f$ of + * stream @f$ n @f$ is a continuation from the previous timestep + * @f$ t-1 @f$, and the previous hidden state @f$ h_{t-1} @f$ affects the + * updated hidden state and output. + * + * -# @f$ (N \times ...) @f$ (optional) + * the static (non-time-varying) input @f$ x_{static} @f$. 
+ * After the first axis, whose dimension must be the number of + * independent streams, its dimensions may be arbitrary. + * This is mathematically equivalent to using a time-varying input of + * @f$ x'_t = [x_t; x_{static}] @f$ -- i.e., tiling the static input + * across the @f$ T @f$ timesteps and concatenating with the time-varying + * input. Note that if this input is used, all timesteps in a single + * batch within a particular one of the @f$ N @f$ streams must share the + * same static input, even if the sequence continuation indicators + * suggest that different sequences are ending and beginning within a + * single batch. This may require padding and/or truncation for uniform + * length. + * + * @param top output Blob vector (length 1) + * -# @f$ (T \times N \times D) @f$ + * the time-varying output @f$ y @f$, where @f$ D @f$ is + * recurrent_param.num_output(). + * Refer to documentation for particular RecurrentLayer implementations + * (such as RNNLayer and LSTMLayer) for the definition of @f$ y @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief A Net to implement the Recurrent functionality. + shared_ptr > unrolled_net_; + + /// @brief The number of independent streams to process simultaneously. + int N_; + + /** + * @brief The number of timesteps in the layer's input, and the number of + * timesteps over which to backpropagate through time. + */ + int T_; + + /// @brief Whether the layer has a "static" input copied across all timesteps. + bool static_input_; + + /** + * @brief The last layer to run in the network. (Any later layers are losses + * added to force the recurrent net to do backprop.) 
+ */ + int last_layer_index_; + + /** + * @brief Whether the layer's hidden state at the first and last timesteps + * are layer inputs and outputs, respectively. + */ + bool expose_hidden_; + + vector* > recur_input_blobs_; + vector* > recur_output_blobs_; + vector* > output_blobs_; + Blob* x_input_blob_; + Blob* x_static_input_blob_; + Blob* cont_input_blob_; +}; + +} // namespace caffe + +#endif // CAFFE_RECURRENT_LAYER_HPP_ diff --git a/include/caffe/layers/relu_layer.hpp b/include/caffe/layers/relu_layer.hpp index d7a73f7a8d1..18351c2e661 100644 --- a/include/caffe/layers/relu_layer.hpp +++ b/include/caffe/layers/relu_layer.hpp @@ -78,6 +78,11 @@ class ReLULayer : public NeuronLayer { const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + + virtual void Deconv_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, int deconv_type); + virtual void Deconv_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, int deconv_type); }; } // namespace caffe diff --git a/include/caffe/layers/rnn_layer.hpp b/include/caffe/layers/rnn_layer.hpp new file mode 100644 index 00000000000..6dce238ae17 --- /dev/null +++ b/include/caffe/layers/rnn_layer.hpp @@ -0,0 +1,47 @@ +#ifndef CAFFE_RNN_LAYER_HPP_ +#define CAFFE_RNN_LAYER_HPP_ + +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/recurrent_layer.hpp" +#include "caffe/net.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +template class RecurrentLayer; + +/** + * @brief Processes time-varying inputs using a simple recurrent neural network + * (RNN). Implemented as a network unrolling the RNN computation in time. 
 + * + * Given time-varying inputs @f$ x_t @f$, computes hidden state @f$ + * h_t := \tanh[ W_{hh} h_{t-1} + W_{xh} x_t + b_h ] + * @f$, and outputs @f$ + * o_t := \tanh[ W_{ho} h_t + b_o ] + * @f$. + */ +template +class RNNLayer : public RecurrentLayer { + public: + explicit RNNLayer(const LayerParameter& param) + : RecurrentLayer(param) {} + + virtual inline const char* type() const { return "RNN"; } + + protected: + virtual void FillUnrolledNet(NetParameter* net_param) const; + virtual void RecurrentInputBlobNames(vector* names) const; + virtual void RecurrentOutputBlobNames(vector* names) const; + virtual void RecurrentInputShapes(vector* shapes) const; + virtual void OutputBlobNames(vector* names) const; +}; + +} // namespace caffe + +#endif // CAFFE_RNN_LAYER_HPP_ diff --git a/include/caffe/layers/scale_layer.hpp b/include/caffe/layers/scale_layer.hpp index 924df2e51ab..45b714d4027 100644 --- a/include/caffe/layers/scale_layer.hpp +++ b/include/caffe/layers/scale_layer.hpp @@ -12,13 +12,15 @@ namespace caffe { /** - * @brief Computes a product of two input Blobs, with the shape of the - * latter Blob "broadcast" to match the shape of the former. + * @brief Computes the elementwise product of two input Blobs, with the shape of + * the latter Blob "broadcast" to match the shape of the former. * Equivalent to tiling the latter Blob, then computing the elementwise - * product. + * product. Note: for efficiency and convenience, this layer can + * additionally perform a "broadcast" sum too when `bias_term: true` + * is set. + * - * The second input may be omitted, in which case it's learned as a parameter - * of the layer. + * The latter, scale input may be omitted, in which case it's learned as + * a parameter of the layer (as is the bias, if it is included). 
*/ template class ScaleLayer: public Layer { diff --git a/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp b/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp index 598dca5ff2c..3d92524421c 100644 --- a/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp +++ b/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp @@ -59,6 +59,8 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { /// @copydoc SigmoidCrossEntropyLossLayer virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); /** * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the @@ -95,6 +97,13 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + /// Read the normalization mode parameter and compute the normalizer based + /// on the blob size. If normalization_mode is VALID, the count of valid + /// outputs will be read from valid_count, unless it is -1 in which case + /// all outputs are assumed to be valid. + virtual Dtype get_normalizer( + LossParameter_NormalizationMode normalization_mode, int valid_count); + /// The internal SigmoidLayer used to map predictions to probabilities. shared_ptr > sigmoid_layer_; /// sigmoid_output stores the output of the SigmoidLayer. @@ -103,6 +112,15 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { vector*> sigmoid_bottom_vec_; /// top vector holder to call the underlying SigmoidLayer::Forward vector*> sigmoid_top_vec_; + + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. + int ignore_label_; + /// How to normalize the loss. 
+ LossParameter_NormalizationMode normalization_; + Dtype normalizer_; + int outer_num_, inner_num_; }; } // namespace caffe diff --git a/include/caffe/layers/softmax_layer.hpp b/include/caffe/layers/softmax_layer.hpp index c65b8703e43..46f57de033b 100644 --- a/include/caffe/layers/softmax_layer.hpp +++ b/include/caffe/layers/softmax_layer.hpp @@ -36,6 +36,7 @@ class SoftmaxLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + int outer_num_; int inner_num_; int softmax_axis_; diff --git a/include/caffe/layers/window_data_layer.hpp b/include/caffe/layers/window_data_layer.hpp index 35f41b80e63..b9b66b7cf1d 100644 --- a/include/caffe/layers/window_data_layer.hpp +++ b/include/caffe/layers/window_data_layer.hpp @@ -16,7 +16,8 @@ namespace caffe { /** * @brief Provides data to the Net from windows of images files, specified - * by a window data file. + * by a window data file. This layer is *DEPRECATED* and only kept for + * archival purposes for use by the original R-CNN. * * TODO(dox): thorough documentation for Forward and proto params. */ diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 0addb3c2a6d..b2ed259dbbb 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -23,9 +23,9 @@ namespace caffe { template class Net { public: - explicit Net(const NetParameter& param, const Net* root_net = NULL); + explicit Net(const NetParameter& param); explicit Net(const string& param_file, Phase phase, - const Net* root_net = NULL); + const int level = 0, const vector* stages = NULL); virtual ~Net() {} /// @brief Initialize a network with a NetParameter. @@ -74,6 +74,14 @@ class Net { void BackwardFrom(int start); void BackwardTo(int end); + /** + * The network deconv works similarly to backward and also takes no input and output. 
+ */ + void Deconv(int deconv_type); + void DeconvFromTo(int start, int end, int deconv_type); + void DeconvFrom(int start, int deconv_type); + void DeconvTo(int end, int deconv_type); + /** * @brief Reshape all layers from bottom to top. * @@ -227,6 +235,31 @@ class Net { static bool StateMeetsRule(const NetState& state, const NetStateRule& rule, const string& layer_name); + // Invoked at specific points during an iteration + class Callback { + protected: + virtual void run(int layer) = 0; + + template + friend class Net; + }; + const vector& before_forward() const { return before_forward_; } + void add_before_forward(Callback* value) { + before_forward_.push_back(value); + } + const vector& after_forward() const { return after_forward_; } + void add_after_forward(Callback* value) { + after_forward_.push_back(value); + } + const vector& before_backward() const { return before_backward_; } + void add_before_backward(Callback* value) { + before_backward_.push_back(value); + } + const vector& after_backward() const { return after_backward_; } + void add_after_backward(Callback* value) { + after_backward_.push_back(value); + } + protected: // Helpers for Init. /// @brief Append a new top blob to the net. @@ -245,6 +278,8 @@ class Net { void ForwardDebugInfo(const int layer_id); /// @brief Helper for displaying debug info in Backward. void BackwardDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Deconv. + void DeconvDebugInfo(const int layer_id); /// @brief Helper for displaying debug info in Update. void UpdateDebugInfo(const int param_id); @@ -305,9 +340,13 @@ class Net { size_t memory_used_; /// Whether to compute and display debug info for the net. 
bool debug_info_; - /// The root net that actually holds the shared layers in data parallelism - const Net* const root_net_; - DISABLE_COPY_AND_ASSIGN(Net); + // Callbacks + vector before_forward_; + vector after_forward_; + vector before_backward_; + vector after_backward_; + +DISABLE_COPY_AND_ASSIGN(Net); }; diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp index 6c496c884e3..64bb48e6b02 100644 --- a/include/caffe/parallel.hpp +++ b/include/caffe/parallel.hpp @@ -1,8 +1,11 @@ #ifndef CAFFE_PARALLEL_HPP_ #define CAFFE_PARALLEL_HPP_ -#include +#ifdef USE_NCCL +#include + +#include #include #include "caffe/blob.hpp" @@ -13,6 +16,7 @@ #include "caffe/solver.hpp" #include "caffe/syncedmem.hpp" #include "caffe/util/blocking_queue.hpp" +#include "caffe/util/nccl.hpp" namespace caffe { @@ -51,7 +55,7 @@ class GPUParams : public Params { GPUParams(shared_ptr > root_solver, int device); virtual ~GPUParams(); - void configure(Solver* solver) const; + void Configure(Solver* solver) const; protected: using Params::size_; @@ -59,58 +63,55 @@ class GPUParams : public Params { using Params::diff_; }; -class DevicePair { - public: - DevicePair(int parent, int device) - : parent_(parent), - device_(device) { - } - inline int parent() { - return parent_; - } - inline int device() { - return device_; - } - - // Group GPUs in pairs, by proximity depending on machine's topology - static void compute(const vector devices, vector* pairs); - - protected: - int parent_; - int device_; -}; - -// Synchronous data parallelism using map-reduce between local GPUs. 
template -class P2PSync : public GPUParams, public Solver::Callback, - public InternalThread { +class NCCL : public GPUParams, + public Solver::Callback, + public Net::Callback { public: - explicit P2PSync(shared_ptr > root_solver, - P2PSync* parent, const SolverParameter& param); - virtual ~P2PSync(); - - inline const shared_ptr >& solver() const { - return solver_; - } - - void Run(const vector& gpus); - void Prepare(const vector& gpus, - vector > >* syncs); - inline const int initial_iter() const { return initial_iter_; } + /** + * Single process version. + */ + explicit NCCL(shared_ptr > solver); + /** + * In multi-process settings, first create a NCCL id (new_uid), then + * pass it to each process to create connected instances. + */ + NCCL(shared_ptr > solver, const string& uid); + ~NCCL(); + + boost::barrier* barrier(); + void set_barrier(boost::barrier* value); + + /** + * In single process settings, create instances without uids and + * call this to connect them. + */ + static void InitSingleProcess(vector*>* nccls); + + static string new_uid(); + + /** + * Broadcast weights from rank 0 other solvers. + */ + void Broadcast(); + + /** + * Single process multi-GPU. 
+ */ + void Run(const vector& gpus, const char* restore); protected: - void on_start(); + void Init(); + void on_start() {} + void run(int layer); // Net callback void on_gradients_ready(); - void InternalThreadEntry(); + ncclComm_t comm_; + cudaStream_t stream_; - P2PSync* parent_; - vector*> children_; - BlockingQueue*> queue_; - const int initial_iter_; - Dtype* parent_grads_; shared_ptr > solver_; - + // Should not be necessary, https://github.com/NVIDIA/nccl/issues/37 + boost::barrier* barrier_; using Params::size_; using Params::data_; using Params::diff_; @@ -118,4 +119,5 @@ class P2PSync : public GPUParams, public Solver::Callback, } // namespace caffe -#endif +#endif // USE_NCCL +#endif // header diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index 38259edad9f..a28d8cb897e 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -6,13 +6,14 @@ #include "caffe/net.hpp" #include "caffe/solver_factory.hpp" +#include "caffe/util/benchmark.hpp" namespace caffe { /** * @brief Enumeration of actions that a client of the Solver may request by * implementing the Solver's action request function, which a - * a client may optionally provide in order to request early termination + * client may optionally provide in order to request early termination * or saving a snapshot without exiting. In the executable caffe, this * mechanism is used to allow the snapshot to be saved when stopping * execution with a SIGINT (Ctrl-C). 
@@ -40,9 +41,8 @@ typedef boost::function ActionCallback; template class Solver { public: - explicit Solver(const SolverParameter& param, - const Solver* root_solver = NULL); - explicit Solver(const string& param_file, const Solver* root_solver = NULL); + explicit Solver(const SolverParameter& param); + explicit Solver(const string& param_file); void Init(const SolverParameter& param); void InitTrainNet(); void InitTestNets(); @@ -72,7 +72,7 @@ class Solver { inline const vector > >& test_nets() { return test_nets_; } - int iter() { return iter_; } + int iter() const { return iter_; } // Invoked at specific points during an iteration class Callback { @@ -118,10 +118,6 @@ class Solver { vector losses_; Dtype smoothed_loss_; - // The root solver that holds root nets (actually containing shared layers) - // in data parallelism - const Solver* const root_solver_; - // A function that can be set by a client of the Solver to provide indication // that it wants a snapshot saved and/or to exit early. ActionCallback action_request_function_; @@ -129,31 +125,11 @@ class Solver { // True iff a request to stop early was received. bool requested_early_exit_; - DISABLE_COPY_AND_ASSIGN(Solver); -}; + // Timing information, handy to tune e.g. nbr of GPUs + Timer iteration_timer_; + float iterations_last_; -/** - * @brief Solver that only computes gradients, used as worker - * for multi-GPU training. 
- */ -template -class WorkerSolver : public Solver { - public: - explicit WorkerSolver(const SolverParameter& param, - const Solver* root_solver = NULL) - : Solver(param, root_solver) {} - - protected: - void ApplyUpdate() {} - void SnapshotSolverState(const string& model_filename) { - LOG(FATAL) << "Should not be called on worker solver."; - } - void RestoreSolverStateFromBinaryProto(const string& state_file) { - LOG(FATAL) << "Should not be called on worker solver."; - } - void RestoreSolverStateFromHDF5(const string& state_file) { - LOG(FATAL) << "Should not be called on worker solver."; - } + DISABLE_COPY_AND_ASSIGN(Solver); }; } // namespace caffe diff --git a/include/caffe/solver_factory.hpp b/include/caffe/solver_factory.hpp index cfff721af40..a5b160739b2 100644 --- a/include/caffe/solver_factory.hpp +++ b/include/caffe/solver_factory.hpp @@ -15,7 +15,7 @@ * and its type is its C++ class name, but without the "Solver" at the end * ("MyAwesomeSolver" -> "MyAwesome"). * - * If the solver is going to be created simply by its constructor, in your c++ + * If the solver is going to be created simply by its constructor, in your C++ * file, add the following line: * * REGISTER_SOLVER_CLASS(MyAwesome); diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 38ee4664028..317ce29a257 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -3,6 +3,10 @@ #include +#ifdef USE_MKL + #include "mkl.h" +#endif + #include "caffe/common.hpp" namespace caffe { @@ -20,7 +24,11 @@ inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) { return; } #endif +#ifdef USE_MKL + *ptr = mkl_malloc(size ? 
size:1, 64); +#else *ptr = malloc(size); +#endif *use_cuda = false; CHECK(*ptr) << "host allocation of size " << size << " failed"; } @@ -32,7 +40,11 @@ inline void CaffeFreeHost(void* ptr, bool use_cuda) { return; } #endif +#ifdef USE_MKL + mkl_free(ptr); +#else free(ptr); +#endif } @@ -44,14 +56,8 @@ inline void CaffeFreeHost(void* ptr, bool use_cuda) { */ class SyncedMemory { public: - SyncedMemory() - : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), - own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false), - gpu_device_(-1) {} - explicit SyncedMemory(size_t size) - : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), - own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false), - gpu_device_(-1) {} + SyncedMemory(); + explicit SyncedMemory(size_t size); ~SyncedMemory(); const void* cpu_data(); void set_cpu_data(void* data); @@ -68,6 +74,8 @@ class SyncedMemory { #endif private: + void check_device(); + void to_cpu(); void to_gpu(); void* cpu_ptr_; @@ -77,7 +85,7 @@ class SyncedMemory { bool own_cpu_data_; bool cpu_malloc_use_cuda_; bool own_gpu_data_; - int gpu_device_; + int device_; DISABLE_COPY_AND_ASSIGN(SyncedMemory); }; // class SyncedMemory diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp index fc156091476..294f7e5011a 100644 --- a/include/caffe/test/test_caffe_main.hpp +++ b/include/caffe/test/test_caffe_main.hpp @@ -18,9 +18,8 @@ using std::endl; #include "caffe_config.h" #else #define CUDA_TEST_DEVICE -1 - #define CMAKE_SOURCE_DIR "src/" #define EXAMPLES_SOURCE_DIR "examples/" - #define CMAKE_EXT "" + #define ABS_TEST_DATA_DIR "src/caffe/test/test_data" #endif int main(int argc, char** argv); diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp index 8a7e17c6cd4..498cfe385de 100644 --- a/include/caffe/util/cudnn.hpp +++ b/include/caffe/util/cudnn.hpp @@ -41,6 +41,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t 
status) { return "CUDNN_STATUS_NOT_SUPPORTED"; case CUDNN_STATUS_LICENSE_ERROR: return "CUDNN_STATUS_LICENSE_ERROR"; +#if CUDNN_VERSION_MIN(6, 0, 0) + case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: + return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING"; +#endif } return "Unknown cudnn status"; } @@ -91,8 +95,13 @@ template inline void createFilterDesc(cudnnFilterDescriptor_t* desc, int n, int c, int h, int w) { CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); +#if CUDNN_VERSION_MIN(5, 0, 0) CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType::type, - n, c, h, w)); + CUDNN_TENSOR_NCHW, n, c, h, w)); +#else + CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(*desc, dataType::type, + CUDNN_TENSOR_NCHW, n, c, h, w)); +#endif } template @@ -104,8 +113,14 @@ template inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, int pad_h, int pad_w, int stride_h, int stride_w) { +#if CUDNN_VERSION_MIN(6, 0, 0) CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, + pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION, + dataType::type)); +#else + CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); +#endif } template @@ -123,8 +138,21 @@ inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc, LOG(FATAL) << "Unknown pooling method."; } CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc)); - CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w, - pad_h, pad_w, stride_h, stride_w)); +#if CUDNN_VERSION_MIN(5, 0, 0) + CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, + CUDNN_PROPAGATE_NAN, h, w, pad_h, pad_w, stride_h, stride_w)); +#else + CUDNN_CHECK(cudnnSetPooling2dDescriptor_v4(*pool_desc, *mode, + CUDNN_PROPAGATE_NAN, h, w, pad_h, pad_w, stride_h, stride_w)); +#endif +} + +template +inline void createActivationDescriptor(cudnnActivationDescriptor_t* activ_desc, + cudnnActivationMode_t mode) { + 
CUDNN_CHECK(cudnnCreateActivationDescriptor(activ_desc)); + CUDNN_CHECK(cudnnSetActivationDescriptor(*activ_desc, mode, + CUDNN_PROPAGATE_NAN, Dtype(0))); } } // namespace cudnn diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp index e9fa0d32b66..4cdb6db9558 100644 --- a/include/caffe/util/db_leveldb.hpp +++ b/include/caffe/util/db_leveldb.hpp @@ -14,7 +14,10 @@ namespace caffe { namespace db { class LevelDBCursor : public Cursor { public: explicit LevelDBCursor(leveldb::Iterator* iter) - : iter_(iter) { SeekToFirst(); } + : iter_(iter) { + SeekToFirst(); + CHECK(iter_->status().ok()) << iter_->status().ToString(); + } ~LevelDBCursor() { delete iter_; } virtual void SeekToFirst() { iter_->SeekToFirst(); } virtual void Next() { iter_->Next(); } diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp index 4e1568ace50..ee370322383 100644 --- a/include/caffe/util/db_lmdb.hpp +++ b/include/caffe/util/db_lmdb.hpp @@ -3,6 +3,7 @@ #define CAFFE_UTIL_DB_LMDB_HPP #include +#include #include "lmdb.h" @@ -54,14 +55,16 @@ class LMDBCursor : public Cursor { class LMDBTransaction : public Transaction { public: - explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn) - : mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) { } + explicit LMDBTransaction(MDB_env* mdb_env) + : mdb_env_(mdb_env) { } virtual void Put(const string& key, const string& value); - virtual void Commit() { MDB_CHECK(mdb_txn_commit(mdb_txn_)); } + virtual void Commit(); private: - MDB_dbi* mdb_dbi_; - MDB_txn* mdb_txn_; + MDB_env* mdb_env_; + vector keys, values; + + void DoubleMapSize(); DISABLE_COPY_AND_ASSIGN(LMDBTransaction); }; diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp index e3fe4fe29fd..a8c8adff332 100644 --- a/include/caffe/util/device_alternate.hpp +++ b/include/caffe/util/device_alternate.hpp @@ -18,6 +18,19 @@ void classname::Backward_gpu(const vector*>& top, \ const vector& propagate_down, \ const 
vector*>& bottom) { NO_GPU; } \ +#define STUB_GPU_WITH_DECONV(classname) \ +template \ +void classname::Forward_gpu(const vector*>& bottom, \ + const vector*>& top) { NO_GPU; } \ +template \ +void classname::Backward_gpu(const vector*>& top, \ + const vector& propagate_down, \ + const vector*>& bottom) { NO_GPU; } \ +template \ +void classname::Deconv_gpu(const vector*>& top, \ + const vector& propagate_down, \ + const vector*>& bottom, int deconv_type) { NO_GPU; } \ + #define STUB_GPU_FORWARD(classname, funcname) \ template \ void classname::funcname##_##gpu(const vector*>& bottom, \ @@ -29,6 +42,12 @@ void classname::funcname##_##gpu(const vector*>& top, \ const vector& propagate_down, \ const vector*>& bottom) { NO_GPU; } \ +#define STUB_GPU_DECONV(classname, funcname) \ +template \ +void classname::funcname##_##gpu(const vector*>& top, \ + const vector& propagate_down, \ + const vector*>& bottom, int deconv_type) { NO_GPU; } \ + #else // Normal GPU + CPU Caffe. #include diff --git a/include/caffe/util/hdf5.hpp b/include/caffe/util/hdf5.hpp index ce568c5eb0d..71549c1cc02 100644 --- a/include/caffe/util/hdf5.hpp +++ b/include/caffe/util/hdf5.hpp @@ -13,12 +13,12 @@ namespace caffe { template void hdf5_load_nd_dataset_helper( hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); + Blob* blob, bool reshape); template void hdf5_load_nd_dataset( hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); + Blob* blob, bool reshape = false); template void hdf5_save_nd_dataset( diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 6f6d3feeae2..e549120a933 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -52,6 +52,9 @@ void caffe_scal(const int N, const Dtype alpha, Dtype *X); template void caffe_sqr(const int N, const Dtype* a, Dtype* y); +template +void caffe_sqrt(const int N, const Dtype* a, Dtype* y); + template void 
caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); @@ -128,16 +131,16 @@ inline int8_t caffe_sign(Dtype val) { } // output is 1 for the positives, 0 for zero, and -1 for the negatives -DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); +DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])) // This returns a nonzero value if the input has its sign bit set. // The name sngbit is meant to avoid conflicts with std::signbit in the macro. // The extra parens are needed because CUDA < 6.5 defines signbit as a macro, // and we don't want that to expand here when CUDA headers are also included. DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \ - y[i] = static_cast((std::signbit)(x[i]))); + y[i] = static_cast((std::signbit)(x[i]))) -DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); +DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])) template void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); @@ -185,6 +188,11 @@ void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); template void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); +#ifndef CPU_ONLY +template +void caffe_gpu_scal(const int N, const Dtype alpha, Dtype* X, cudaStream_t str); +#endif + template void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); @@ -209,6 +217,9 @@ void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); template void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); +template +void caffe_gpu_sqrt(const int n, const Dtype* a, Dtype* y); + // caffe_gpu_rng_uniform with two arguments generates integers in the range // [0, UINT_MAX]. 
void caffe_gpu_rng_uniform(const int n, unsigned int* r); diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp index 3355b6658a3..8c2294c7c86 100644 --- a/include/caffe/util/mkl_alternate.hpp +++ b/include/caffe/util/mkl_alternate.hpp @@ -7,9 +7,14 @@ #else // If use MKL, simply include the MKL header +#ifdef USE_ACCELERATE +#include +#else extern "C" { #include } +#endif // USE_ACCELERATE + #include // Functions that caffe uses but are not present if MKL is not linked. @@ -31,10 +36,11 @@ extern "C" { v##name(n, a, y); \ } -DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]); -DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i])); -DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i])); -DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])); +DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]) +DEFINE_VSL_UNARY_FUNC(Sqrt, y[i] = sqrt(a[i])) +DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i])) +DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i])) +DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])) // A simple way to define the vsl unary functions with singular parameter b. // The operation should be in the form e.g. y[i] = pow(a[i], b) @@ -53,7 +59,7 @@ DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])); v##name(n, a, b, y); \ } -DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b)); +DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b)) // A simple way to define the vsl binary functions. The operation should // be in the form e.g. 
y[i] = a[i] + b[i] @@ -72,10 +78,10 @@ DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b)); v##name(n, a, b, y); \ } -DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]); -DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]); -DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]); -DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]); +DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]) +DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]) +DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]) +DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]) // In addition, MKL comes with an additional function axpby that is not present // in standard blas. We will simply use a two-step (inefficient, of course) way diff --git a/include/caffe/util/nccl.hpp b/include/caffe/util/nccl.hpp new file mode 100644 index 00000000000..e01fb7451e8 --- /dev/null +++ b/include/caffe/util/nccl.hpp @@ -0,0 +1,37 @@ +#ifndef CAFFE_UTIL_NCCL_H_ +#define CAFFE_UTIL_NCCL_H_ +#ifdef USE_NCCL + +#include + +#include "caffe/common.hpp" + +#define NCCL_CHECK(condition) \ +{ \ + ncclResult_t result = condition; \ + CHECK_EQ(result, ncclSuccess) << " " \ + << ncclGetErrorString(result); \ +} + +namespace caffe { + +namespace nccl { + +template class dataType; + +template<> class dataType { + public: + static const ncclDataType_t type = ncclFloat; +}; +template<> class dataType { + public: + static const ncclDataType_t type = ncclDouble; +}; + +} // namespace nccl + +} // namespace caffe + +#endif // end USE_NCCL + +#endif // CAFFE_UTIL_NCCL_H_ diff --git a/include/caffe/util/upgrade_proto.hpp b/include/caffe/util/upgrade_proto.hpp index 14e1936a8c2..b145822af32 100644 --- a/include/caffe/util/upgrade_proto.hpp +++ b/include/caffe/util/upgrade_proto.hpp @@ -65,6 +65,12 @@ bool NetNeedsInputUpgrade(const NetParameter& net_param); // Perform all necessary transformations to upgrade input fields into layers. void UpgradeNetInput(NetParameter* net_param); +// Return true iff the Net contains batch norm layers with manual local LRs. 
+bool NetNeedsBatchNormUpgrade(const NetParameter& net_param); + +// Perform all necessary transformations to upgrade batch norm layers. +void UpgradeNetBatchNorm(NetParameter* net_param); + // Return true iff the solver contains any old solver_type specified as enums bool SolverNeedsTypeUpgrade(const SolverParameter& solver_param); diff --git a/matlab/+caffe/Net.m b/matlab/+caffe/Net.m index e6295bba1a4..bb99ec89049 100644 --- a/matlab/+caffe/Net.m +++ b/matlab/+caffe/Net.m @@ -68,6 +68,11 @@ self.layer_names = self.attributes.layer_names; self.blob_names = self.attributes.blob_names; end + function delete (self) + if ~isempty(self.hNet_self) + caffe_('delete_net', self.hNet_self); + end + end function layer = layers(self, layer_name) CHECK(ischar(layer_name), 'layer_name must be a string'); layer = self.layer_vec(self.name2layer_index(layer_name)); diff --git a/matlab/+caffe/Solver.m b/matlab/+caffe/Solver.m index f8bdc4e22b2..2d3c98b2a26 100644 --- a/matlab/+caffe/Solver.m +++ b/matlab/+caffe/Solver.m @@ -36,6 +36,9 @@ self.test_nets(n) = caffe.Net(self.attributes.hNet_test_nets(n)); end end + function delete (self) + caffe_('delete_solver', self.hSolver_self); + end function iter = iter(self) iter = caffe_('solver_get_iter', self.hSolver_self); end diff --git a/matlab/+caffe/private/caffe_.cpp b/matlab/+caffe/private/caffe_.cpp index 1b1b2bff861..a32bd5e536d 100644 --- a/matlab/+caffe/private/caffe_.cpp +++ b/matlab/+caffe/private/caffe_.cpp @@ -44,7 +44,7 @@ void mxCHECK_FILE_EXIST(const char* file) { // The pointers to caffe::Solver and caffe::Net instances static vector > > solvers_; static vector > > nets_; -// init_key is generated at the beginning and everytime you call reset +// init_key is generated at the beginning and every time you call reset static double init_key = static_cast(caffe_rng_rand()); /** ----------------------------------------------------------------- @@ -197,6 +197,17 @@ static void get_solver(MEX_ARGS) { mxFree(solver_file); } +// 
Usage: caffe_('delete_solver', hSolver) +static void delete_solver(MEX_ARGS) { + mxCHECK(nrhs == 1 && mxIsStruct(prhs[0]), + "Usage: caffe_('delete_solver', hSolver)"); + Solver* solver = handle_to_ptr >(prhs[0]); + solvers_.erase(std::remove_if(solvers_.begin(), solvers_.end(), + [solver] (const shared_ptr< Solver > &solverPtr) { + return solverPtr.get() == solver; + }), solvers_.end()); +} + // Usage: caffe_('solver_get_attr', hSolver) static void solver_get_attr(MEX_ARGS) { mxCHECK(nrhs == 1 && mxIsStruct(prhs[0]), @@ -271,6 +282,17 @@ static void get_net(MEX_ARGS) { mxFree(phase_name); } +// Usage: caffe_('delete_solver', hSolver) +static void delete_net(MEX_ARGS) { + mxCHECK(nrhs == 1 && mxIsStruct(prhs[0]), + "Usage: caffe_('delete_solver', hNet)"); + Net* net = handle_to_ptr >(prhs[0]); + nets_.erase(std::remove_if(nets_.begin(), nets_.end(), + [net] (const shared_ptr< Net > &netPtr) { + return netPtr.get() == net; + }), nets_.end()); +} + // Usage: caffe_('net_get_attr', hNet) static void net_get_attr(MEX_ARGS) { mxCHECK(nrhs == 1 && mxIsStruct(prhs[0]), @@ -522,12 +544,14 @@ struct handler_registry { static handler_registry handlers[] = { // Public API functions { "get_solver", get_solver }, + { "delete_solver", delete_solver }, { "solver_get_attr", solver_get_attr }, { "solver_get_iter", solver_get_iter }, { "solver_restore", solver_restore }, { "solver_solve", solver_solve }, { "solver_step", solver_step }, { "get_net", get_net }, + { "delete_net", delete_net }, { "net_get_attr", net_get_attr }, { "net_forward", net_forward }, { "net_backward", net_backward }, diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index f420df8d412..987730d9b55 100644 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -20,7 +20,7 @@ if(NOT BUILD_SHARED_LIBS AND build_using MATCHES Matlab) message(FATAL_ERROR "Matlab MEX interface (with default mex options file) can only be built if caffe is compiled as shared library. 
Please enable 'BUILD_SHARED_LIBS' in CMake. Aternativelly you can switch to Octave compiler.") endif() -# helper function to set proper mex file extention +# helper function to set proper mex file extension function(caffe_fetch_and_set_proper_mexext mexfile_variable) execute_process(COMMAND ${Matlab_mexext} OUTPUT_STRIP_TRAILING_WHITESPACE RESULT_VARIABLE res OUTPUT_VARIABLE ext) if(res MATCHES 0) diff --git a/matlab/demo/classification_demo.m b/matlab/demo/classification_demo.m index 2b60332970b..435c077845f 100644 --- a/matlab/demo/classification_demo.m +++ b/matlab/demo/classification_demo.m @@ -8,7 +8,7 @@ % % **************************************************************************** % For detailed documentation and usage on Caffe's Matlab interface, please -% refer to Caffe Interface Tutorial at +% refer to the Caffe Interface Tutorial at % http://caffe.berkeleyvision.org/tutorial/interfaces.html#matlab % **************************************************************************** % @@ -24,6 +24,7 @@ % $ export LD_LIBRARY_PATH=/opt/intel/mkl/lib/intel64:/usr/local/cuda-5.5/lib64 % $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 % Or the equivalent based on where things are installed on your system +% and what versions are installed. % % Usage: % im = imread('../../examples/images/cat.jpg'); @@ -39,7 +40,7 @@ % Data coming in from matlab needs to be in the order % [width, height, channels, images] % where width is the fastest dimension. -% Here is the rough matlab for putting image data into the correct +% Here is the rough matlab code for putting image data into the correct % format in W x H x C with BGR channels: % % permute channels from RGB to BGR % im_data = im(:, :, [3, 2, 1]); @@ -54,7 +55,7 @@ % If you have multiple images, cat them with cat(4, ...) 
-% Add caffe/matlab to you Matlab search PATH to use matcaffe +% Add caffe/matlab to your Matlab search PATH in order to use matcaffe if exist('../+caffe', 'dir') addpath('..'); else diff --git a/models/bvlc_alexnet/readme.md b/models/bvlc_alexnet/readme.md index 008d690f7f4..a83e3d4e27c 100644 --- a/models/bvlc_alexnet/readme.md +++ b/models/bvlc_alexnet/readme.md @@ -1,5 +1,5 @@ --- -name: BVLC AlexNet Model +name: BAIR/BVLC AlexNet Model caffemodel: bvlc_alexnet.caffemodel caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel license: unrestricted diff --git a/models/bvlc_googlenet/readme.md b/models/bvlc_googlenet/readme.md index 061b6d74530..ef04db62ab2 100644 --- a/models/bvlc_googlenet/readme.md +++ b/models/bvlc_googlenet/readme.md @@ -1,5 +1,5 @@ --- -name: BVLC GoogleNet Model +name: BAIR/BVLC GoogleNet Model caffemodel: bvlc_googlenet.caffemodel caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel license: unrestricted diff --git a/models/bvlc_googlenet/train_val.prototxt b/models/bvlc_googlenet/train_val.prototxt old mode 100644 new mode 100755 index 5dee3abe28f..5fe367f2263 --- a/models/bvlc_googlenet/train_val.prototxt +++ b/models/bvlc_googlenet/train_val.prototxt @@ -1692,7 +1692,7 @@ layer { type: "SoftmaxWithLoss" bottom: "loss2/classifier" bottom: "label" - top: "loss2/loss1" + top: "loss2/loss2" loss_weight: 0.3 } layer { diff --git a/models/bvlc_reference_caffenet/readme.md b/models/bvlc_reference_caffenet/readme.md index 671e47a5056..5352e536a07 100644 --- a/models/bvlc_reference_caffenet/readme.md +++ b/models/bvlc_reference_caffenet/readme.md @@ -1,5 +1,5 @@ --- -name: BVLC CaffeNet Model +name: BAIR/BVLC CaffeNet Model caffemodel: bvlc_reference_caffenet.caffemodel caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_reference_caffenet.caffemodel license: unrestricted diff --git a/models/bvlc_reference_rcnn_ilsvrc13/readme.md b/models/bvlc_reference_rcnn_ilsvrc13/readme.md index 
9a11a24d8f8..12543b2bd2c 100644 --- a/models/bvlc_reference_rcnn_ilsvrc13/readme.md +++ b/models/bvlc_reference_rcnn_ilsvrc13/readme.md @@ -1,5 +1,5 @@ --- -name: BVLC Reference RCNN ILSVRC13 Model +name: BAIR/BVLC Reference RCNN ILSVRC13 Model caffemodel: bvlc_reference_rcnn_ilsvrc13.caffemodel caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_reference_rcnn_ilsvrc13.caffemodel license: unrestricted diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index a22641401f0..c53299d265b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -3,13 +3,13 @@ if(NOT HAVE_PYTHON) return() endif() -include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) file(GLOB_RECURSE python_srcs ${PROJECT_SOURCE_DIR}/python/*.cpp) add_library(pycaffe SHARED ${python_srcs}) -target_link_libraries(pycaffe ${Caffe_LINK} ${PYTHON_LIBRARIES} ${Boost_LIBRARIES}) -set_target_properties(pycaffe PROPERTIES PREFIX "" OUTPUT_NAME "_caffe") caffe_default_properties(pycaffe) +set_target_properties(pycaffe PROPERTIES PREFIX "" OUTPUT_NAME "_caffe") +target_include_directories(pycaffe PUBLIC ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR}) +target_link_libraries(pycaffe PUBLIC ${Caffe_LINK} ${PYTHON_LIBRARIES}) if(UNIX OR APPLE) set(__linkname "${PROJECT_SOURCE_DIR}/python/caffe/_caffe.so") @@ -22,13 +22,19 @@ if(UNIX OR APPLE) endif() # ---[ Install -file(GLOB files1 *.py requirements.txt) -install(FILES ${files1} DESTINATION python) - -file(GLOB files2 caffe/*.py) -install(FILES ${files2} DESTINATION python/caffe) +# scripts +file(GLOB python_files *.py requirements.txt) +install(FILES ${python_files} DESTINATION python) + +# module +install(DIRECTORY caffe + DESTINATION python + FILES_MATCHING + PATTERN "*.py" + PATTERN "ilsvrc_2012_mean.npy" + PATTERN "test" EXCLUDE + ) + +# _caffe.so install(TARGETS pycaffe DESTINATION python/caffe) -install(DIRECTORY caffe/imagenet caffe/proto caffe/test DESTINATION python/caffe) - - diff --git 
a/python/caffe/__init__.py b/python/caffe/__init__.py index e2881b89c1b..776945eec88 100644 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -1,5 +1,5 @@ -from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver -from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list +from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer +from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess, has_nccl from ._caffe import __version__ from .proto.caffe_pb2 import TRAIN, TEST from .classifier import Classifier diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index a2c46a123aa..6bc5bbb5b90 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -26,6 +26,19 @@ #define PyArray_SetBaseObject(arr, x) (PyArray_BASE(arr) = (x)) #endif +/* Fix to avoid registration warnings in pycaffe (#3960) */ +#define BP_REGISTER_SHARED_PTR_TO_PYTHON(PTR) do { \ + const boost::python::type_info info = \ + boost::python::type_id >(); \ + const boost::python::converter::registration* reg = \ + boost::python::converter::registry::query(info); \ + if (reg == NULL) { \ + bp::register_ptr_to_python >(); \ + } else if ((*reg).m_to_python == NULL) { \ + bp::register_ptr_to_python >(); \ + } \ +} while (0) + namespace bp = boost::python; namespace caffe { @@ -38,6 +51,25 @@ const int NPY_DTYPE = NPY_FLOAT32; void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); } void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); } +void InitLog() { + ::google::InitGoogleLogging(""); + ::google::InstallFailureSignalHandler(); +} +void InitLogLevel(int level) { + FLAGS_minloglevel = level; + InitLog(); +} +void InitLogLevelPipe(int level, bool stderr) { + FLAGS_minloglevel = level; 
+ FLAGS_logtostderr = stderr; + InitLog(); +} +void Log(const string& s) { + LOG(INFO) << s; +} + +void set_random_seed(unsigned int seed) { Caffe::set_random_seed(seed); } + // For convenience, check that input files can be opened, and raise an // exception that boost will send to Python if not (caffe could still crash // later if the input files are disturbed before they are actually used, but @@ -73,19 +105,42 @@ void CheckContiguousArray(PyArrayObject* arr, string name, } } -// Net constructor for passing phase as int -shared_ptr > Net_Init( - string param_file, int phase) { - CheckFile(param_file); +// Net constructor +shared_ptr > Net_Init(string network_file, int phase, + const int level, const bp::object& stages, + const bp::object& weights) { + CheckFile(network_file); + + // Convert stages from list to vector + vector stages_vector; + if (!stages.is_none()) { + for (int i = 0; i < len(stages); i++) { + stages_vector.push_back(bp::extract(stages[i])); + } + } + + // Initialize net + shared_ptr > net(new Net(network_file, + static_cast(phase), level, &stages_vector)); + + // Load weights + if (!weights.is_none()) { + std::string weights_file_str = bp::extract(weights); + CheckFile(weights_file_str); + net->CopyTrainedLayersFrom(weights_file_str); + } - shared_ptr > net(new Net(param_file, - static_cast(phase))); return net; } -// Net construct-and-load convenience constructor +// Legacy Net construct-and-load convenience constructor shared_ptr > Net_Init_Load( string param_file, string pretrained_param_file, int phase) { + LOG(WARNING) << "DEPRECATION WARNING - deprecated use of Python interface"; + LOG(WARNING) << "Use this instead (with the named \"weights\"" + << " parameter):"; + LOG(WARNING) << "Net('" << param_file << "', " << phase + << ", weights='" << pretrained_param_file << "')"; CheckFile(param_file); CheckFile(pretrained_param_file); @@ -101,6 +156,14 @@ void Net_Save(const Net& net, string filename) { WriteProtoToBinaryFile(net_param, 
filename.c_str()); } +void Net_SaveHDF5(const Net& net, string filename) { + net.ToHDF5(filename); +} + +void Net_LoadHDF5(Net* net, string filename) { + net->CopyTrainedLayersFromHDF5(filename.c_str()); +} + void Net_SetInputArrays(Net* net, bp::object data_obj, bp::object labels_obj) { // check that this network has an input MemoryDataLayer @@ -207,6 +270,112 @@ bp::object BlobVec_add_blob(bp::tuple args, bp::dict kwargs) { return bp::object(); } +template +class SolverCallback: public Solver::Callback { + protected: + bp::object on_start_, on_gradients_ready_; + + public: + SolverCallback(bp::object on_start, bp::object on_gradients_ready) + : on_start_(on_start), on_gradients_ready_(on_gradients_ready) { } + virtual void on_gradients_ready() { + on_gradients_ready_(); + } + virtual void on_start() { + on_start_(); + } +}; +template +void Solver_add_callback(Solver * solver, bp::object on_start, + bp::object on_gradients_ready) { + solver->add_callback(new SolverCallback(on_start, on_gradients_ready)); +} + +// Seems boost cannot call the base method directly +void Solver_add_nccl(Solver* solver +#ifdef USE_NCCL + , NCCL* nccl +#endif +) { +#ifdef USE_NCCL + solver->add_callback(nccl); +#endif +} + +void share_weights(Solver* solver, Net* net) { + net->ShareTrainedLayersWith(solver->net().get()); +} + +template +class NetCallback: public Net::Callback { + public: + explicit NetCallback(bp::object run) : run_(run) {} + + protected: + virtual void run(int layer) { + run_(layer); + } + bp::object run_; +}; +void Net_before_forward(Net* net, bp::object run) { + net->add_before_forward(new NetCallback(run)); +} +void Net_after_forward(Net* net, bp::object run) { + net->add_after_forward(new NetCallback(run)); +} +void Net_before_backward(Net* net, bp::object run) { + net->add_before_backward(new NetCallback(run)); +} +void Net_after_backward(Net* net, bp::object run) { + net->add_after_backward(new NetCallback(run)); +} + +void Net_add_nccl(Net* net +#ifdef USE_NCCL 
+ , NCCL* nccl +#endif +) { +#ifdef USE_NCCL + net->add_after_backward(nccl); +#endif +} +#ifndef USE_NCCL +template +class NCCL { + public: + NCCL(shared_ptr > solver, const string& uid) {} +}; +#endif + +bool HasNCCL() { +#ifdef USE_NCCL + return true; +#else + return false; +#endif +} + +#ifdef USE_NCCL +bp::object NCCL_New_Uid() { + std::string uid = NCCL::new_uid(); +#if PY_MAJOR_VERSION >= 3 + // Convert std::string to bytes so that Python does not + // try to decode the string using the current locale. + + // Since boost 1.53 boost.python will convert str and bytes + // to std::string but will convert std::string to str. Here we + // force a bytes object to be returned. When this object + // is passed back to the NCCL constructor boost.python will + // correctly convert the bytes to std::string automatically + PyObject* py_uid = PyBytes_FromString(uid.c_str()); + return bp::object(bp::handle<>(py_uid)); +#else + // automatic conversion is correct for python 2. + return bp::object(uid); +#endif +} +#endif + BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(SolveOverloads, Solve, 0, 1); BOOST_PYTHON_MODULE(_caffe) { @@ -216,19 +385,37 @@ BOOST_PYTHON_MODULE(_caffe) { bp::scope().attr("__version__") = AS_STRING(CAFFE_VERSION); // Caffe utility functions + bp::def("init_log", &InitLog); + bp::def("init_log", &InitLogLevel); + bp::def("init_log", &InitLogLevelPipe); + bp::def("log", &Log); + bp::def("has_nccl", &HasNCCL); bp::def("set_mode_cpu", &set_mode_cpu); bp::def("set_mode_gpu", &set_mode_gpu); + bp::def("set_random_seed", &set_random_seed); bp::def("set_device", &Caffe::SetDevice); + bp::def("solver_count", &Caffe::solver_count); + bp::def("set_solver_count", &Caffe::set_solver_count); + bp::def("solver_rank", &Caffe::solver_rank); + bp::def("set_solver_rank", &Caffe::set_solver_rank); + bp::def("set_multiprocess", &Caffe::set_multiprocess); bp::def("layer_type_list", &LayerRegistry::LayerTypeList); bp::class_, shared_ptr >, boost::noncopyable >("Net", bp::no_init) - 
.def("__init__", bp::make_constructor(&Net_Init)) + // Constructor + .def("__init__", bp::make_constructor(&Net_Init, + bp::default_call_policies(), (bp::arg("network_file"), "phase", + bp::arg("level")=0, bp::arg("stages")=bp::object(), + bp::arg("weights")=bp::object()))) + // Legacy constructor .def("__init__", bp::make_constructor(&Net_Init_Load)) .def("_forward", &Net::ForwardFromTo) .def("_backward", &Net::BackwardFromTo) + .def("_deconv", &Net::DeconvFromTo) .def("reshape", &Net::Reshape) + .def("clear_param_diffs", &Net::ClearParamDiffs) // The cast is to select a particular overload. .def("copy_from", static_cast::*)(const string)>( &Net::CopyTrainedLayersFrom)) @@ -254,8 +441,15 @@ BOOST_PYTHON_MODULE(_caffe) { bp::return_value_policy())) .def("_set_input_arrays", &Net_SetInputArrays, bp::with_custodian_and_ward<1, 2, bp::with_custodian_and_ward<1, 3> >()) - .def("save", &Net_Save); - bp::register_ptr_to_python > >(); + .def("save", &Net_Save) + .def("save_hdf5", &Net_SaveHDF5) + .def("load_hdf5", &Net_LoadHDF5) + .def("before_forward", &Net_before_forward) + .def("after_forward", &Net_after_forward) + .def("before_backward", &Net_before_backward) + .def("after_backward", &Net_after_backward) + .def("after_backward", &Net_add_nccl); + BP_REGISTER_SHARED_PTR_TO_PYTHON(Net); bp::class_, shared_ptr >, boost::noncopyable>( "Blob", bp::no_init) @@ -271,11 +465,19 @@ BOOST_PYTHON_MODULE(_caffe) { .add_property("count", static_cast::*)() const>( &Blob::count)) .def("reshape", bp::raw_function(&Blob_Reshape)) +#ifndef CPU_ONLY + .add_property("_gpu_data_ptr", + reinterpret_cast::*)()>( + &Blob::mutable_gpu_data)) + .add_property("_gpu_diff_ptr", + reinterpret_cast::*)()>( + &Blob::mutable_gpu_diff)) +#endif .add_property("data", bp::make_function(&Blob::mutable_cpu_data, NdarrayCallPolicies())) .add_property("diff", bp::make_function(&Blob::mutable_cpu_diff, NdarrayCallPolicies())); - bp::register_ptr_to_python > >(); + BP_REGISTER_SHARED_PTR_TO_PYTHON(Blob); 
bp::class_, shared_ptr >, boost::noncopyable>("Layer", bp::init()) @@ -284,8 +486,12 @@ BOOST_PYTHON_MODULE(_caffe) { .def("setup", &Layer::LayerSetUp) .def("reshape", &Layer::Reshape) .add_property("type", bp::make_function(&Layer::type)); - bp::register_ptr_to_python > >(); + BP_REGISTER_SHARED_PTR_TO_PYTHON(Layer); + bp::class_("SolverParameter", bp::no_init) + .add_property("max_iter", &SolverParameter::max_iter) + .add_property("display", &SolverParameter::display) + .add_property("layer_wise_reduce", &SolverParameter::layer_wise_reduce); bp::class_("LayerParameter", bp::no_init); bp::class_, shared_ptr >, boost::noncopyable>( @@ -294,12 +500,17 @@ BOOST_PYTHON_MODULE(_caffe) { .add_property("test_nets", bp::make_function(&Solver::test_nets, bp::return_internal_reference<>())) .add_property("iter", &Solver::iter) + .def("add_callback", &Solver_add_callback) + .def("add_callback", &Solver_add_nccl) .def("solve", static_cast::*)(const char*)>( &Solver::Solve), SolveOverloads()) .def("step", &Solver::Step) .def("restore", &Solver::Restore) - .def("snapshot", &Solver::Snapshot); - bp::register_ptr_to_python > >(); + .def("snapshot", &Solver::Snapshot) + .def("share_weights", &share_weights) + .add_property("param", bp::make_function(&Solver::param, + bp::return_value_policy())); + BP_REGISTER_SHARED_PTR_TO_PYTHON(Solver); bp::class_, bp::bases >, shared_ptr >, boost::noncopyable>( @@ -342,6 +553,24 @@ BOOST_PYTHON_MODULE(_caffe) { bp::class_ >("BoolVec") .def(bp::vector_indexing_suite >()); + bp::class_, shared_ptr >, + boost::noncopyable>("NCCL", + bp::init >, const string&>()) +#ifdef USE_NCCL + .def("new_uid", NCCL_New_Uid).staticmethod("new_uid") + .def("bcast", &NCCL::Broadcast) +#endif + /* NOLINT_NEXT_LINE(whitespace/semicolon) */ + ; + BP_REGISTER_SHARED_PTR_TO_PYTHON(NCCL); + + bp::class_, boost::noncopyable>( + "Timer", bp::init<>()) + .def("start", &Timer::Start) + .def("stop", &Timer::Stop) + .add_property("ms", &Timer::MilliSeconds); + 
BP_REGISTER_SHARED_PTR_TO_PYTHON(Timer); + // boost python expects a void (missing) return value, while import_array // returns NULL for python3. import_array1() forces a void return value. import_array1(); diff --git a/python/caffe/classifier.py b/python/caffe/classifier.py index 537193db8f8..983760a786d 100644 --- a/python/caffe/classifier.py +++ b/python/caffe/classifier.py @@ -79,6 +79,7 @@ def predict(self, inputs, oversample=True): -self.crop_dims / 2.0, self.crop_dims / 2.0 ]) + crop = crop.astype(int) input_ = input_[:, crop[0]:crop[2], crop[1]:crop[3], :] # Classify @@ -91,7 +92,7 @@ def predict(self, inputs, oversample=True): # For oversampling, average predictions across crops. if oversample: - predictions = predictions.reshape((len(predictions) / 10, 10, -1)) + predictions = predictions.reshape((len(predictions) // 10, 10, -1)) predictions = predictions.mean(1) return predictions diff --git a/python/caffe/detector.py b/python/caffe/detector.py index 75cd3b1202f..ef1f91730bf 100644 --- a/python/caffe/detector.py +++ b/python/caffe/detector.py @@ -83,7 +83,7 @@ def detect_windows(self, images_windows): for ix, window_in in enumerate(window_inputs): caffe_in[ix] = self.transformer.preprocess(in_, window_in) out = self.forward_all(**{in_: caffe_in}) - predictions = out[self.outputs[0]].squeeze(axis=(2, 3)) + predictions = out[self.outputs[0]] # Package predictions with images and windows. 
detections = [] diff --git a/python/caffe/draw.py b/python/caffe/draw.py index cfa3fc5b1fb..8411a41d1d4 100644 --- a/python/caffe/draw.py +++ b/python/caffe/draw.py @@ -91,11 +91,11 @@ def get_layer_label(layer, rankdir): separator, layer.type, separator, - layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size._values) else 1, + layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size) else 1, separator, - layer.convolution_param.stride[0] if len(layer.convolution_param.stride._values) else 1, + layer.convolution_param.stride[0] if len(layer.convolution_param.stride) else 1, separator, - layer.convolution_param.pad[0] if len(layer.convolution_param.pad._values) else 0) + layer.convolution_param.pad[0] if len(layer.convolution_param.pad) else 0) elif layer.type == 'Pooling': pooling_types_dict = get_pooling_types_dict() node_label = '"%s%s(%s %s)%skernel size: %d%sstride: %d%spad: %d"' %\ @@ -127,7 +127,7 @@ def choose_color_by_layertype(layertype): return color -def get_pydot_graph(caffe_net, rankdir, label_edges=True): +def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None): """Create a data structure which represents the `caffe_net`. Parameters @@ -137,17 +137,33 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True): Direction of graph layout. label_edges : boolean, optional Label the edges (default is True). + phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional + Include layers from this network phase. If None, include all layers. 
+ (the default is None) Returns ------- pydot graph object """ - pydot_graph = pydot.Dot(caffe_net.name, + pydot_graph = pydot.Dot(caffe_net.name if caffe_net.name else 'Net', graph_type='digraph', rankdir=rankdir) pydot_nodes = {} pydot_edges = [] for layer in caffe_net.layer: + if phase is not None: + included = False + if len(layer.include) == 0: + included = True + if len(layer.include) > 0 and len(layer.exclude) > 0: + raise ValueError('layer ' + layer.name + ' has both include ' + 'and exclude specified.') + for layer_phase in layer.include: + included = included or layer_phase.phase == phase + for layer_phase in layer.exclude: + included = included and not layer_phase.phase == phase + if not included: + continue node_label = get_layer_label(layer, rankdir) node_name = "%s_%s" % (layer.name, layer.type) if (len(layer.bottom) == 1 and len(layer.top) == 1 and @@ -186,7 +202,7 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True): return pydot_graph -def draw_net(caffe_net, rankdir, ext='png'): +def draw_net(caffe_net, rankdir, ext='png', phase=None): """Draws a caffe net and returns the image string encoded using the given extension. @@ -195,16 +211,19 @@ def draw_net(caffe_net, rankdir, ext='png'): caffe_net : a caffe.proto.caffe_pb2.NetParameter protocol buffer. ext : string, optional The image extension (the default is 'png'). + phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional + Include layers from this network phase. If None, include all layers. + (the default is None) Returns ------- string : Postscript representation of the graph. """ - return get_pydot_graph(caffe_net, rankdir).create(format=ext) + return get_pydot_graph(caffe_net, rankdir, phase=phase).create(format=ext) -def draw_net_to_file(caffe_net, filename, rankdir='LR'): +def draw_net_to_file(caffe_net, filename, rankdir='LR', phase=None): """Draws a caffe net, and saves it to file using the format given as the file extension. 
Use '.raw' to output raw text that you can manually feed to graphviz to draw graphs. @@ -216,7 +235,10 @@ def draw_net_to_file(caffe_net, filename, rankdir='LR'): The path to a file where the networks visualization will be stored. rankdir : {'LR', 'TB', 'BT'} Direction of graph layout. + phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional + Include layers from this network phase. If None, include all layers. + (the default is None) """ ext = filename[filename.rfind('.')+1:] with open(filename, 'wb') as fid: - fid.write(draw_net(caffe_net, rankdir, ext)) + fid.write(draw_net(caffe_net, rankdir, ext, phase)) diff --git a/python/caffe/io.py b/python/caffe/io.py index 75310589cec..966c164cffd 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -46,7 +46,7 @@ def array_to_blobproto(arr, diff=None): return blob -def arraylist_to_blobprotovecor_str(arraylist): +def arraylist_to_blobprotovector_str(arraylist): """Converts a list of arrays to a serialized blobprotovec, which could be then passed to a network for processing. """ @@ -63,7 +63,7 @@ def blobprotovector_str_to_arraylist(str): return [blobproto_to_array(blob) for blob in vec.blobs] -def array_to_datum(arr, label=0): +def array_to_datum(arr, label=None): """Converts a 3-dimensional array to datum. If the array has dtype uint8, the output data will be encoded as a string. Otherwise, the output data will be stored in float format. 
@@ -75,8 +75,9 @@ def array_to_datum(arr, label=0): if arr.dtype == np.uint8: datum.data = arr.tostring() else: - datum.float_data.extend(arr.flat) - datum.label = label + datum.float_data.extend(arr.astype(float).flat) + if label is not None: + datum.label = label return datum diff --git a/python/caffe/net_spec.py b/python/caffe/net_spec.py index 63de4cce4b2..20918f9b6bc 100644 --- a/python/caffe/net_spec.py +++ b/python/caffe/net_spec.py @@ -32,7 +32,7 @@ def param_name_dict(): # get all parameter names (typically underscore case) and corresponding # type names (typically camel case), which contain the layer names # (note that not all parameters correspond to layers, but we'll ignore that) - param_names = [s for s in dir(layer) if s.endswith('_param')] + param_names = [f.name for f in layer.DESCRIPTOR.fields if f.name.endswith('_param')] param_type_names = [type(getattr(layer, s)).__name__ for s in param_names] # strip the final '_param' or 'Parameter' param_names = [s[:-len('_param')] for s in param_names] @@ -103,6 +103,10 @@ class Function(object): def __init__(self, type_name, inputs, params): self.type_name = type_name + for index, input in enumerate(inputs): + if not isinstance(input, Top): + raise TypeError('%s input %d is not a Top (type is %s)' % + (type_name, index, type(input))) self.inputs = inputs self.params = params self.ntop = self.params.get('ntop', 1) diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py index c5c0b824a77..f16f7ace5ea 100644 --- a/python/caffe/pycaffe.py +++ b/python/caffe/pycaffe.py @@ -11,7 +11,7 @@ import numpy as np from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \ - RMSPropSolver, AdaDeltaSolver, AdamSolver + RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer import caffe.io import six @@ -27,7 +27,9 @@ def _Net_blobs(self): An OrderedDict (bottom to top, i.e., input to output) of network blobs indexed by name """ - return OrderedDict(zip(self._blob_names, self._blobs)) + if not hasattr(self, 
'_blobs_dict'): + self._blobs_dict = OrderedDict(zip(self._blob_names, self._blobs)) + return self._blobs_dict @property @@ -36,7 +38,20 @@ def _Net_blob_loss_weights(self): An OrderedDict (bottom to top, i.e., input to output) of network blob loss weights indexed by name """ - return OrderedDict(zip(self._blob_names, self._blob_loss_weights)) + if not hasattr(self, '_blobs_loss_weights_dict'): + self._blob_loss_weights_dict = OrderedDict(zip(self._blob_names, + self._blob_loss_weights)) + return self._blob_loss_weights_dict + +@property +def _Net_layer_dict(self): + """ + An OrderedDict (bottom to top, i.e., input to output) of network + layers indexed by name + """ + if not hasattr(self, '_layer_dict'): + self._layer_dict = OrderedDict(zip(self._layer_names, self.layers)) + return self._layer_dict @property @@ -46,19 +61,167 @@ def _Net_params(self): parameters indexed by name; each is a list of multiple blobs (e.g., weights and biases) """ - return OrderedDict([(name, lr.blobs) - for name, lr in zip(self._layer_names, self.layers) - if len(lr.blobs) > 0]) + if not hasattr(self, '_params_dict'): + self._params_dict = OrderedDict([(name, lr.blobs) + for name, lr in zip( + self._layer_names, self.layers) + if len(lr.blobs) > 0]) + return self._params_dict + + +def _Net_zero(self, zero_param_diffs = True): + """ + Set all activations (data and diffs) in the net to zero. + + Take + zero_param_diffs: If True, also zero the parameter blob diffs, + else skip parameter blobs. + """ + + for blob_name, blob in self.blobs.items(): + blob.data[...] = 0 + blob.diff[...] = 0 + if zero_param_diffs: + for param_name, blob_vec in self.params.items(): + for blob in blob_vec: + blob.diff[...] = 0 + + +def _Net_backward_from_layer(self, start_name, start_diff, diffs=None, zero_higher=False): + """ + Backward pass starting from somewhere in the middle of the + network, starting with the provided diffs. 
+ + Take + start_name: layer at which to begin the backward pass + start_diff: diff to set at start_name layer + diffs: list of diffs to return in addition to bottom diffs. + zero_higher: whether or not to zero out higher layers to reflect the true 0 derivative or leave them alone to save time. + + Give + outs: {blob name: diff ndarray} dict. + """ + + start_top_name = self.top_names[start_name][0] + if start_diff.shape != self.blobs[start_top_name].diff.shape: + raise Exception('Expected start_diff of shape %s but got %s' % (self.blobs[start_top_name].diff.shape, start_diff.shape)) + + self.blobs[start_top_name].diff[...] = start_diff + + if zero_higher: + past_start = False + for blob_name, blob in self.blobs.items(): + if past_start: + blob.diff[...] = 0 + if blob_name == start_top_name: + past_start = True + + return self.backward(start=start_name, diffs=diffs) + + +def _Net_deconv_from_layer(self, start_name, start_diff, diffs=None, zero_higher=False, deconv_type='Zeiler & Fergus'): + """ + Deconv pass starting from somewhere in the middle of the + network, starting with the provided diffs. + + Take + start_name: layer at which to begin the deconv pass + start_diff: diff to set at start_name layer + diffs: list of diffs to return in addition to bottom diffs. + zero_higher: whether or not to zero out higher layers to reflect the true 0 derivative or leave them alone to save time. + deconv_type: either 'Zeiler & Fergus' or 'Guided Backprop' + + Give + outs: {blob name: diff ndarray} dict. 
+ """ + + # convert deconv type string to int value + if deconv_type == 'Zeiler & Fergus': + deconv_type_int = 0 + elif deconv_type == 'Guided Backprop': + deconv_type_int = 1 + else: + raise Exception('Unsupported deconv type: %s' % (deconv_type)) + + start_top_name = self.top_names[start_name][0] + if start_diff.shape != self.blobs[start_top_name].diff.shape: + raise Exception('Expected start_diff of shape %s but got %s' % (self.blobs[start_top_name].diff.shape, start_diff.shape)) + + self.blobs[start_top_name].diff[...] = start_diff + + if zero_higher: + past_start = False + for blob_name, blob in self.blobs.items(): + if past_start: + blob.diff[...] = 0 + if blob_name == start_top_name: + past_start = True + + return self.deconv(start=start_name, diffs=diffs, deconv_type_int=deconv_type_int) + + +def _Net_deconv(self, diffs=None, start=None, end=None, deconv_type_int=0, **kwargs): + """ + Deconv pass: prepare diffs and run the net backward in deconv mode. Just like _Net_Backward but calls Deconv instead. + + Take + diffs: list of diffs to return in addition to bottom diffs. + kwargs: Keys are output blob names and values are diff ndarrays. + If None, top diffs are taken from forward loss. + start: optional name of layer at which to begin the backward pass + end: optional name of layer at which to finish the backward pass (inclusive) + deconv_type_int: type of deconv to use, 0 for ZF, 1 for guided backprop + + Give + outs: {blob name: diff ndarray} dict. 
+ """ + if diffs is None: + diffs = [] + + if start is not None: + start_ind = list(self._layer_names).index(start) + else: + start_ind = len(self.layers) - 1 + + if end is not None: + end_ind = list(self._layer_names).index(end) + outputs = set([end] + diffs) + else: + end_ind = 0 + outputs = set(self.inputs + diffs) + + if kwargs: + if set(kwargs.keys()) != set(self.outputs): + raise Exception('Top diff arguments do not match net outputs.') + # Set top diffs according to defined shapes and make arrays single and + # C-contiguous as Caffe expects. + for top, diff in kwargs.iteritems(): + if diff.ndim != 4: + raise Exception('{} diff is not 4-d'.format(top)) + if diff.shape[0] != self.blobs[top].num: + raise Exception('Diff is not batch sized') + self.blobs[top].diff[...] = diff + + self._deconv(start_ind, end_ind, deconv_type_int) + + # Unpack diffs to extract + return {out: self.blobs[out].diff for out in outputs} @property def _Net_inputs(self): - return [list(self.blobs.keys())[i] for i in self._inputs] + if not hasattr(self, '_input_list'): + keys = list(self.blobs.keys()) + self._input_list = [keys[i] for i in self._inputs] + return self._input_list @property def _Net_outputs(self): - return [list(self.blobs.keys())[i] for i in self._outputs] + if not hasattr(self, '_output_list'): + keys = list(self.blobs.keys()) + self._output_list = [keys[i] for i in self._outputs] + return self._output_list def _Net_forward(self, blobs=None, start=None, end=None, **kwargs): @@ -89,7 +252,7 @@ def _Net_forward(self, blobs=None, start=None, end=None, **kwargs): if end is not None: end_ind = list(self._layer_names).index(end) - outputs = set([end] + blobs) + outputs = set(self.top_names[end] + blobs) else: end_ind = len(self.layers) - 1 outputs = set(self.outputs + blobs) @@ -137,7 +300,7 @@ def _Net_backward(self, diffs=None, start=None, end=None, **kwargs): if end is not None: end_ind = list(self._layer_names).index(end) - outputs = set([end] + diffs) + outputs = 
set(self.bottom_names[end] + diffs) else: end_ind = 0 outputs = set(self.inputs + diffs) @@ -278,33 +441,48 @@ def _Net_batch(self, blobs): padding]) yield padded_batch - -class _Net_IdNameWrapper: - """ - A simple wrapper that allows the ids propery to be accessed as a dict - indexed by names. Used for top and bottom names +def _Net_get_id_name(func, field): """ - def __init__(self, net, func): - self.net, self.func = net, func + Generic property that maps func to the layer names into an OrderedDict. + + Used for top_names and bottom_names. - def __getitem__(self, name): - # Map the layer name to id - ids = self.func(self.net, list(self.net._layer_names).index(name)) - # Map the blob id to name - id_to_name = list(self.net.blobs) - return [id_to_name[i] for i in ids] + Parameters + ---------- + func: function id -> [id] + field: implementation field name (cache) + + Returns + ------ + A one-parameter function that can be set as a property. + """ + @property + def get_id_name(self): + if not hasattr(self, field): + id_to_name = list(self.blobs) + res = OrderedDict([(self._layer_names[i], + [id_to_name[j] for j in func(self, i)]) + for i in range(len(self.layers))]) + setattr(self, field, res) + return getattr(self, field) + return get_id_name # Attach methods to Net. 
Net.blobs = _Net_blobs Net.blob_loss_weights = _Net_blob_loss_weights +Net.layer_dict = _Net_layer_dict Net.params = _Net_params +Net.zero = _Net_zero +Net.backward_from_layer = _Net_backward_from_layer +Net.deconv_from_layer = _Net_deconv_from_layer Net.forward = _Net_forward Net.backward = _Net_backward +Net.deconv = _Net_deconv Net.forward_all = _Net_forward_all Net.forward_backward_all = _Net_forward_backward_all Net.set_input_arrays = _Net_set_input_arrays Net._batch = _Net_batch Net.inputs = _Net_inputs Net.outputs = _Net_outputs -Net.top_names = property(lambda n: _Net_IdNameWrapper(n, Net._top_ids)) -Net.bottom_names = property(lambda n: _Net_IdNameWrapper(n, Net._bottom_ids)) +Net.top_names = _Net_get_id_name(Net._top_ids, "_top_names") +Net.bottom_names = _Net_get_id_name(Net._bottom_ids, "_bottom_names") diff --git a/python/caffe/test/test_draw.py b/python/caffe/test/test_draw.py new file mode 100644 index 00000000000..835bb5df010 --- /dev/null +++ b/python/caffe/test/test_draw.py @@ -0,0 +1,37 @@ +import os +import unittest + +from google.protobuf import text_format + +import caffe.draw +from caffe.proto import caffe_pb2 + +def getFilenames(): + """Yields files in the source tree which are Net prototxts.""" + result = [] + + root_dir = os.path.abspath(os.path.join( + os.path.dirname(__file__), '..', '..', '..')) + assert os.path.exists(root_dir) + + for dirname in ('models', 'examples'): + dirname = os.path.join(root_dir, dirname) + assert os.path.exists(dirname) + for cwd, _, filenames in os.walk(dirname): + for filename in filenames: + filename = os.path.join(cwd, filename) + if filename.endswith('.prototxt') and 'solver' not in filename: + yield os.path.join(dirname, filename) + + +class TestDraw(unittest.TestCase): + def test_draw_net(self): + for filename in getFilenames(): + net = caffe_pb2.NetParameter() + with open(filename) as infile: + text_format.Merge(infile.read(), net) + caffe.draw.draw_net(net, 'LR') + + +if __name__ == "__main__": + 
unittest.main() diff --git a/python/caffe/test/test_io.py b/python/caffe/test/test_io.py index 8c86ef75fb2..4a16b5b9128 100644 --- a/python/caffe/test/test_io.py +++ b/python/caffe/test/test_io.py @@ -39,3 +39,18 @@ def test_scalar(self): arr = caffe.io.blobproto_to_array(blob) self.assertEqual(arr, 123) + + +class TestArrayToDatum(unittest.TestCase): + + def test_label_none_size(self): + # Set label + d1 = caffe.io.array_to_datum( + np.ones((10,10,3)), label=1) + # Don't set label + d2 = caffe.io.array_to_datum( + np.ones((10,10,3))) + # Not setting the label should result in a smaller object + self.assertGreater( + len(d1.SerializeToString()), + len(d2.SerializeToString())) diff --git a/python/caffe/test/test_nccl.py b/python/caffe/test/test_nccl.py new file mode 100644 index 00000000000..127a9337040 --- /dev/null +++ b/python/caffe/test/test_nccl.py @@ -0,0 +1,19 @@ +import sys +import unittest + +import caffe + + +class TestNCCL(unittest.TestCase): + + def test_newuid(self): + """ + Test that NCCL uids are of the proper type + according to python version + """ + if caffe.has_nccl(): + uid = caffe.NCCL.new_uid() + if sys.version_info.major >= 3: + self.assertTrue(isinstance(uid, bytes)) + else: + self.assertTrue(isinstance(uid, str)) diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py index aad828aa8aa..afd27690981 100644 --- a/python/caffe/test/test_net.py +++ b/python/caffe/test/test_net.py @@ -3,6 +3,7 @@ import os import numpy as np import six +from collections import OrderedDict import caffe @@ -24,11 +25,11 @@ def simple_net_file(num_output): bias_filler { type: 'constant' value: 2 } } param { decay_mult: 1 } param { decay_mult: 0 } } - layer { type: 'InnerProduct' name: 'ip' bottom: 'conv' top: 'ip' + layer { type: 'InnerProduct' name: 'ip' bottom: 'conv' top: 'ip_blob' inner_product_param { num_output: """ + str(num_output) + """ weight_filler { type: 'gaussian' std: 2.5 } bias_filler { type: 'constant' value: -3 } } } - layer { 
type: 'SoftmaxWithLoss' name: 'loss' bottom: 'ip' bottom: 'label' + layer { type: 'SoftmaxWithLoss' name: 'loss' bottom: 'ip_blob' bottom: 'label' top: 'loss' }""") f.close() return f.name @@ -59,23 +60,330 @@ def test_memory(self): for bl in blobs: total += bl.data.sum() + bl.diff.sum() + def test_layer_dict(self): + layer_dict = self.net.layer_dict + self.assertEqual(list(layer_dict.keys()), list(self.net._layer_names)) + for i, name in enumerate(self.net._layer_names): + self.assertEqual(layer_dict[name].type, + self.net.layers[i].type) + def test_forward_backward(self): self.net.forward() self.net.backward() + def test_forward_start_end(self): + conv_blob=self.net.blobs['conv']; + ip_blob=self.net.blobs['ip_blob']; + sample_data=np.random.uniform(size=conv_blob.data.shape); + sample_data=sample_data.astype(np.float32); + conv_blob.data[:]=sample_data; + forward_blob=self.net.forward(start='ip',end='ip'); + self.assertIn('ip_blob',forward_blob); + + manual_forward=[]; + for i in range(0,conv_blob.data.shape[0]): + dot=np.dot(self.net.params['ip'][0].data, + conv_blob.data[i].reshape(-1)); + manual_forward.append(dot+self.net.params['ip'][1].data); + manual_forward=np.array(manual_forward); + + np.testing.assert_allclose(ip_blob.data,manual_forward,rtol=1e-3); + + def test_backward_start_end(self): + conv_blob=self.net.blobs['conv']; + ip_blob=self.net.blobs['ip_blob']; + sample_data=np.random.uniform(size=ip_blob.data.shape) + sample_data=sample_data.astype(np.float32); + ip_blob.diff[:]=sample_data; + backward_blob=self.net.backward(start='ip',end='ip'); + self.assertIn('conv',backward_blob); + + manual_backward=[]; + for i in range(0,conv_blob.data.shape[0]): + dot=np.dot(self.net.params['ip'][0].data.transpose(), + sample_data[i].reshape(-1)); + manual_backward.append(dot); + manual_backward=np.array(manual_backward); + manual_backward=manual_backward.reshape(conv_blob.data.shape); + + np.testing.assert_allclose(conv_blob.diff,manual_backward,rtol=1e-3); + + 
def test_clear_param_diffs(self): + # Run a forward/backward step to have non-zero diffs + self.net.forward() + self.net.backward() + diff = self.net.params["conv"][0].diff + # Check that we have non-zero diffs + self.assertTrue(diff.max() > 0) + self.net.clear_param_diffs() + # Check that the diffs are now 0 + self.assertTrue((diff == 0).all()) + def test_inputs_outputs(self): self.assertEqual(self.net.inputs, []) self.assertEqual(self.net.outputs, ['loss']) + def test_top_bottom_names(self): + self.assertEqual(self.net.top_names, + OrderedDict([('data', ['data', 'label']), + ('conv', ['conv']), + ('ip', ['ip_blob']), + ('loss', ['loss'])])) + self.assertEqual(self.net.bottom_names, + OrderedDict([('data', []), + ('conv', ['data']), + ('ip', ['conv']), + ('loss', ['ip_blob', 'label'])])) + def test_save_and_read(self): f = tempfile.NamedTemporaryFile(mode='w+', delete=False) f.close() self.net.save(f.name) net_file = simple_net_file(self.num_output) - net2 = caffe.Net(net_file, f.name, caffe.TRAIN) + # Test legacy constructor + # should print deprecation warning + caffe.Net(net_file, f.name, caffe.TRAIN) + # Test named constructor + net2 = caffe.Net(net_file, caffe.TRAIN, weights=f.name) os.remove(net_file) os.remove(f.name) for name in self.net.params: for i in range(len(self.net.params[name])): self.assertEqual(abs(self.net.params[name][i].data - net2.params[name][i].data).sum(), 0) + + def test_save_hdf5(self): + f = tempfile.NamedTemporaryFile(mode='w+', delete=False) + f.close() + self.net.save_hdf5(f.name) + net_file = simple_net_file(self.num_output) + net2 = caffe.Net(net_file, caffe.TRAIN) + net2.load_hdf5(f.name) + os.remove(net_file) + os.remove(f.name) + for name in self.net.params: + for i in range(len(self.net.params[name])): + self.assertEqual(abs(self.net.params[name][i].data + - net2.params[name][i].data).sum(), 0) + +class TestLevels(unittest.TestCase): + + TEST_NET = """ +layer { + name: "data" + type: "DummyData" + top: "data" + 
dummy_data_param { shape { dim: 1 dim: 1 dim: 10 dim: 10 } } +} +layer { + name: "NoLevel" + type: "InnerProduct" + bottom: "data" + top: "NoLevel" + inner_product_param { num_output: 1 } +} +layer { + name: "Level0Only" + type: "InnerProduct" + bottom: "data" + top: "Level0Only" + include { min_level: 0 max_level: 0 } + inner_product_param { num_output: 1 } +} +layer { + name: "Level1Only" + type: "InnerProduct" + bottom: "data" + top: "Level1Only" + include { min_level: 1 max_level: 1 } + inner_product_param { num_output: 1 } +} +layer { + name: "Level>=0" + type: "InnerProduct" + bottom: "data" + top: "Level>=0" + include { min_level: 0 } + inner_product_param { num_output: 1 } +} +layer { + name: "Level>=1" + type: "InnerProduct" + bottom: "data" + top: "Level>=1" + include { min_level: 1 } + inner_product_param { num_output: 1 } +} +""" + + def setUp(self): + self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False) + self.f.write(self.TEST_NET) + self.f.close() + + def tearDown(self): + os.remove(self.f.name) + + def check_net(self, net, blobs): + net_blobs = [b for b in net.blobs.keys() if 'data' not in b] + self.assertEqual(net_blobs, blobs) + + def test_0(self): + net = caffe.Net(self.f.name, caffe.TEST) + self.check_net(net, ['NoLevel', 'Level0Only', 'Level>=0']) + + def test_1(self): + net = caffe.Net(self.f.name, caffe.TEST, level=1) + self.check_net(net, ['NoLevel', 'Level1Only', 'Level>=0', 'Level>=1']) + + +class TestStages(unittest.TestCase): + + TEST_NET = """ +layer { + name: "data" + type: "DummyData" + top: "data" + dummy_data_param { shape { dim: 1 dim: 1 dim: 10 dim: 10 } } +} +layer { + name: "A" + type: "InnerProduct" + bottom: "data" + top: "A" + include { stage: "A" } + inner_product_param { num_output: 1 } +} +layer { + name: "B" + type: "InnerProduct" + bottom: "data" + top: "B" + include { stage: "B" } + inner_product_param { num_output: 1 } +} +layer { + name: "AorB" + type: "InnerProduct" + bottom: "data" + top: "AorB" + include { 
stage: "A" } + include { stage: "B" } + inner_product_param { num_output: 1 } +} +layer { + name: "AandB" + type: "InnerProduct" + bottom: "data" + top: "AandB" + include { stage: "A" stage: "B" } + inner_product_param { num_output: 1 } +} +""" + + def setUp(self): + self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False) + self.f.write(self.TEST_NET) + self.f.close() + + def tearDown(self): + os.remove(self.f.name) + + def check_net(self, net, blobs): + net_blobs = [b for b in net.blobs.keys() if 'data' not in b] + self.assertEqual(net_blobs, blobs) + + def test_A(self): + net = caffe.Net(self.f.name, caffe.TEST, stages=['A']) + self.check_net(net, ['A', 'AorB']) + + def test_B(self): + net = caffe.Net(self.f.name, caffe.TEST, stages=['B']) + self.check_net(net, ['B', 'AorB']) + + def test_AandB(self): + net = caffe.Net(self.f.name, caffe.TEST, stages=['A', 'B']) + self.check_net(net, ['A', 'B', 'AorB', 'AandB']) + + +class TestAllInOne(unittest.TestCase): + + TEST_NET = """ +layer { + name: "train_data" + type: "DummyData" + top: "data" + top: "label" + dummy_data_param { + shape { dim: 1 dim: 1 dim: 10 dim: 10 } + shape { dim: 1 dim: 1 dim: 1 dim: 1 } + } + include { phase: TRAIN stage: "train" } +} +layer { + name: "val_data" + type: "DummyData" + top: "data" + top: "label" + dummy_data_param { + shape { dim: 1 dim: 1 dim: 10 dim: 10 } + shape { dim: 1 dim: 1 dim: 1 dim: 1 } + } + include { phase: TEST stage: "val" } +} +layer { + name: "deploy_data" + type: "Input" + top: "data" + input_param { shape { dim: 1 dim: 1 dim: 10 dim: 10 } } + include { phase: TEST stage: "deploy" } +} +layer { + name: "ip" + type: "InnerProduct" + bottom: "data" + top: "ip" + inner_product_param { num_output: 2 } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "ip" + bottom: "label" + top: "loss" + include: { phase: TRAIN stage: "train" } + include: { phase: TEST stage: "val" } +} +layer { + name: "pred" + type: "Softmax" + bottom: "ip" + top: "pred" + include: 
{ phase: TEST stage: "deploy" } +} +""" + + def setUp(self): + self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False) + self.f.write(self.TEST_NET) + self.f.close() + + def tearDown(self): + os.remove(self.f.name) + + def check_net(self, net, outputs): + self.assertEqual(list(net.blobs['data'].shape), [1,1,10,10]) + self.assertEqual(net.outputs, outputs) + + def test_train(self): + net = caffe.Net(self.f.name, caffe.TRAIN, stages=['train']) + self.check_net(net, ['loss']) + + def test_val(self): + net = caffe.Net(self.f.name, caffe.TEST, stages=['val']) + self.check_net(net, ['loss']) + + def test_deploy(self): + net = caffe.Net(self.f.name, caffe.TEST, stages=['deploy']) + self.check_net(net, ['pred']) + diff --git a/python/caffe/test/test_net_spec.py b/python/caffe/test/test_net_spec.py index fee3c0aaebe..ffe71bacb08 100644 --- a/python/caffe/test/test_net_spec.py +++ b/python/caffe/test/test_net_spec.py @@ -79,3 +79,11 @@ def test_zero_tops(self): net_proto = silent_net() net = self.load_net(net_proto) self.assertEqual(len(net.forward()), 0) + + def test_type_error(self): + """Test that a TypeError is raised when a Function input isn't a Top.""" + data = L.DummyData(ntop=2) # data is a 2-tuple of Tops + r = r"^Silence input 0 is not a Top \(type is <(type|class) 'tuple'>\)$" + with self.assertRaisesRegexp(TypeError, r): + L.Silence(data, ntop=0) # should raise: data is a tuple, not a Top + L.Silence(*data, ntop=0) # shouldn't raise: each elt of data is a Top diff --git a/python/caffe/test/test_python_layer.py b/python/caffe/test/test_python_layer.py index e46b7118014..899514e90f1 100644 --- a/python/caffe/test/test_python_layer.py +++ b/python/caffe/test/test_python_layer.py @@ -44,6 +44,18 @@ def forward(self, bottom, top): def backward(self, top, propagate_down, bottom): self.blobs[0].diff[0] = 1 +class PhaseLayer(caffe.Layer): + """A layer for checking attribute `phase`""" + + def setup(self, bottom, top): + pass + + def reshape(self, bootom, top): + 
top[0].reshape() + + def forward(self, bottom, top): + top[0].data[()] = self.phase + def python_net_file(): with tempfile.NamedTemporaryFile(mode='w+', delete=False) as f: f.write("""name: 'pythonnet' force_backward: true @@ -76,6 +88,14 @@ def parameter_net_file(): """) return f.name +def phase_net_file(): + with tempfile.NamedTemporaryFile(mode='w+', delete=False) as f: + f.write("""name: 'pythonnet' force_backward: true + layer { type: 'Python' name: 'layer' top: 'phase' + python_param { module: 'test_python_layer' layer: 'PhaseLayer' } } + """) + return f.name + @unittest.skipIf('Python' not in caffe.layer_type_list(), 'Caffe built without Python layer support') @@ -140,3 +160,9 @@ def test_parameter(self): self.assertEqual(layer.blobs[0].data[0], 1) os.remove(net_file) + + def test_phase(self): + net_file = phase_net_file() + for phase in caffe.TRAIN, caffe.TEST: + net = caffe.Net(net_file, phase) + self.assertEqual(net.forward()['phase'], phase) diff --git a/python/draw_net.py b/python/draw_net.py index ec76a744da3..dfe70d26a71 100755 --- a/python/draw_net.py +++ b/python/draw_net.py @@ -28,6 +28,11 @@ def parse_args(): 'http://www.graphviz.org/doc/info/' 'attrs.html#k:rankdir'), default='LR') + parser.add_argument('--phase', + help=('Which network phase to draw: can be TRAIN, ' + 'TEST, or ALL. 
If ALL, then all layers are drawn ' + 'regardless of phase.'), + default="ALL") args = parser.parse_args() return args @@ -38,7 +43,15 @@ def main(): net = caffe_pb2.NetParameter() text_format.Merge(open(args.input_net_proto_file).read(), net) print('Drawing net to %s' % args.output_image_file) - caffe.draw.draw_net_to_file(net, args.output_image_file, args.rankdir) + phase=None; + if args.phase == "TRAIN": + phase = caffe.TRAIN + elif args.phase == "TEST": + phase = caffe.TEST + elif args.phase != "ALL": + raise ValueError("Unknown phase: " + args.phase) + caffe.draw.draw_net_to_file(net, args.output_image_file, args.rankdir, + phase) if __name__ == '__main__': diff --git a/python/train.py b/python/train.py new file mode 100644 index 00000000000..5897f5dcb90 --- /dev/null +++ b/python/train.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +""" +Trains a model using one or more GPUs. +""" +from multiprocessing import Process + +import caffe + + +def train( + solver, # solver proto definition + snapshot, # solver snapshot to restore + gpus, # list of device ids + timing=False, # show timing info for compute and communications +): + # NCCL uses a uid to identify a session + uid = caffe.NCCL.new_uid() + + caffe.init_log() + caffe.log('Using devices %s' % str(gpus)) + + procs = [] + for rank in range(len(gpus)): + p = Process(target=solve, + args=(solver, snapshot, gpus, timing, uid, rank)) + p.daemon = True + p.start() + procs.append(p) + for p in procs: + p.join() + + +def time(solver, nccl): + fprop = [] + bprop = [] + total = caffe.Timer() + allrd = caffe.Timer() + for _ in range(len(solver.net.layers)): + fprop.append(caffe.Timer()) + bprop.append(caffe.Timer()) + display = solver.param.display + + def show_time(): + if solver.iter % display == 0: + s = '\n' + for i in range(len(solver.net.layers)): + s += 'forw %3d %8s ' % (i, solver.net._layer_names[i]) + s += ': %.2f\n' % fprop[i].ms + for i in range(len(solver.net.layers) - 1, -1, -1): + s += 'back %3d %8s ' % (i, 
solver.net._layer_names[i]) + s += ': %.2f\n' % bprop[i].ms + s += 'solver total: %.2f\n' % total.ms + s += 'allreduce: %.2f\n' % allrd.ms + caffe.log(s) + + solver.net.before_forward(lambda layer: fprop[layer].start()) + solver.net.after_forward(lambda layer: fprop[layer].stop()) + solver.net.before_backward(lambda layer: bprop[layer].start()) + solver.net.after_backward(lambda layer: bprop[layer].stop()) + solver.add_callback(lambda: total.start(), lambda: (total.stop(), allrd.start())) + solver.add_callback(nccl) + solver.add_callback(lambda: '', lambda: (allrd.stop(), show_time())) + + +def solve(proto, snapshot, gpus, timing, uid, rank): + caffe.set_mode_gpu() + caffe.set_device(gpus[rank]) + caffe.set_solver_count(len(gpus)) + caffe.set_solver_rank(rank) + caffe.set_multiprocess(True) + + solver = caffe.SGDSolver(proto) + if snapshot and len(snapshot) != 0: + solver.restore(snapshot) + + nccl = caffe.NCCL(solver, uid) + nccl.bcast() + + if timing and rank == 0: + time(solver, nccl) + else: + solver.add_callback(nccl) + + if solver.param.layer_wise_reduce: + solver.net.after_backward(nccl) + solver.step(solver.param.max_iter) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + + parser.add_argument("--solver", required=True, help="Solver proto definition.") + parser.add_argument("--snapshot", help="Solver snapshot to restore.") + parser.add_argument("--gpus", type=int, nargs='+', default=[0], + help="List of device ids.") + parser.add_argument("--timing", action='store_true', help="Show timing info.") + args = parser.parse_args() + + train(args.solver, args.snapshot, args.gpus, args.timing) diff --git a/scripts/build_docs.sh b/scripts/build_docs.sh index 0e28bd71631..4837587ad30 100755 --- a/scripts/build_docs.sh +++ b/scripts/build_docs.sh @@ -12,6 +12,9 @@ cd $ROOT_DIR # Gather docs. scripts/gather_examples.sh +# Split caffe.proto for inclusion by layer catalogue. +scripts/split_caffe_proto.py + # Generate developer docs. 
make docs diff --git a/scripts/caffe b/scripts/caffe new file mode 100644 index 00000000000..8a0b22af6ac --- /dev/null +++ b/scripts/caffe @@ -0,0 +1,73 @@ +# bash completion for Caffe's command line utility -*- shell-script -*- +# COPYRIGHT (C) 2015,2016 Zhou Mo +# License: BSD-2-Clause +# Originally appeard at https://github.com/BVLC/caffe/issues/3149 + +# Updated for caffe (1.0.0~rc3+20160715-g42cd785) +_caffe() +{ + local cur prev words cword + _init_completion -s || return + + local prototxts='@(prototxt)' + local caffemodels='@(caffemodel,binaryproto)' + local solverstates='@(solverstate)' + local caffefiles='@(prototxt|caffemodel|solverstate)' + + local flags='-gpu -iterations -model -snapshot -solver -weights -sighup_effect -sigint_effect -level -stage -phase' + + if [[ $cword -eq 1 ]]; then + COMPREPLY=( $( compgen -W 'train test time device_query' -- "$cur" ) ) + return 0 + fi + + if [[ $cword -eq 2 ]]; then + case ${words[1]} in + train|test|device_query|time) + COMPREPLY=( $( compgen -W "$flags" -- "$cur") ) + return 0 + ;; + *) + return 0 + ;; + esac + fi + + case $prev in + -gpu|-iterations|-version|-level|-stage) + return 0 + ;; + -solver|-model) + _filedir $prototxts + return 0 + ;; + -weights) + _filedir $caffemodels + return 0 + ;; + -snapshot) + _filedir $solverstates + return 0 + ;; + -sighup_effect|-sigint_effect) + COMPREPLY=( $( compgen -W 'snapshot stop none' -- "$cur") ) + return 0 + ;; + -phase) + COMPREPLY=( $( compgen -W 'TRAIN TEST' -- "$cur") ) + return 0 + ;; + *) + COMPREPLY=( $( compgen -W "$flags" -- "$cur") ) + return 0 + ;; + esac + + # file completion on relevant files + _filedir "$caffefiles" + + return 0 +} +complete -F _caffe caffe + +# vim diff --git a/scripts/cpp_lint.py b/scripts/cpp_lint.py index 14c76ecd6bf..b2016d4b6dd 100755 --- a/scripts/cpp_lint.py +++ b/scripts/cpp_lint.py @@ -1,4 +1,4 @@ -#!/usr/bin/python2 +#!/usr/bin/env python # # Copyright (c) 2009 Google Inc. All rights reserved. 
# @@ -52,6 +52,10 @@ import sys import unicodedata +import six + +from six import iteritems, itervalues +from six.moves import xrange _USAGE = """ Syntax: cpp_lint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] @@ -756,7 +760,7 @@ def IncrementErrorCount(self, category): def PrintErrorCounts(self): """Print a summary of errors by category, and the total.""" - for category, count in self.errors_by_category.iteritems(): + for category, count in iteritems(self.errors_by_category): sys.stderr.write('Category \'%s\' errors found: %d\n' % (category, count)) sys.stderr.write('Total errors found: %d\n' % self.error_count) @@ -3444,16 +3448,16 @@ def GetLineWidth(line): The width of the line in column positions, accounting for Unicode combining characters and wide characters. """ - if isinstance(line, unicode): - width = 0 - for uc in unicodedata.normalize('NFC', line): - if unicodedata.east_asian_width(uc) in ('W', 'F'): - width += 2 - elif not unicodedata.combining(uc): - width += 1 - return width - else: - return len(line) + if six.PY2: + if isinstance(line, unicode): + width = 0 + for uc in unicodedata.normalize('NFC', line): + if unicodedata.east_asian_width(uc) in ('W', 'F'): + width += 2 + elif not unicodedata.combining(uc): + width += 1 + return width + return len(line) def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, @@ -3774,7 +3778,7 @@ def _GetTextInside(text, start_pattern): # Give opening punctuations to get the matching close-punctuations. matching_punctuation = {'(': ')', '{': '}', '[': ']'} - closing_punctuation = set(matching_punctuation.itervalues()) + closing_punctuation = set(itervalues(matching_punctuation)) # Find the position to start extracting text. match = re.search(start_pattern, text, re.M) @@ -4460,7 +4464,7 @@ def UpdateIncludeState(filename, include_state, io=codecs): io: The io factory to use to read the file. Provided for testability. Returns: - True if a header was succesfully added. False otherwise. 
+ True if a header was successfully added. False otherwise. """ headerfile = None try: @@ -4532,7 +4536,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, # Let's copy the include_state so it is only messed up within this function. include_state = include_state.copy() - # Did we find the header for this file (if any) and succesfully load it? + # Did we find the header for this file (if any) and successfully load it? header_found = False # Use the absolute path so that matching works properly. @@ -4833,7 +4837,7 @@ def ParseArguments(args): try: _valid_extensions = set(val.split(',')) except ValueError: - PrintUsage('Extensions must be comma seperated list.') + PrintUsage('Extensions must be comma separated list.') if not filenames: PrintUsage('No files were specified.') @@ -4851,10 +4855,11 @@ def main(): # Change stderr to write with replacement characters so we don't die # if we try to print something containing non-ASCII characters. - sys.stderr = codecs.StreamReaderWriter(sys.stderr, - codecs.getreader('utf8'), - codecs.getwriter('utf8'), - 'replace') + if six.PY2: + sys.stderr = codecs.StreamReaderWriter(sys.stderr, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace') _cpplint_state.ResetErrorCounts() for filename in filenames: diff --git a/scripts/download_model_binary.py b/scripts/download_model_binary.py index 66f72f2477e..a72fd5d76ba 100755 --- a/scripts/download_model_binary.py +++ b/scripts/download_model_binary.py @@ -3,10 +3,11 @@ import sys import time import yaml -import urllib import hashlib import argparse +from six.moves import urllib + required_keys = ['caffemodel', 'caffemodel_url', 'sha1'] @@ -60,7 +61,7 @@ def valid_dirname(dirname): # Closure-d function for checking SHA1. def model_checks_out(filename=model_filename, sha1=frontmatter['sha1']): - with open(filename, 'r') as f: + with open(filename, 'rb') as f: return hashlib.sha1(f.read()).hexdigest() == sha1 # Check if model exists. 
@@ -69,7 +70,7 @@ def model_checks_out(filename=model_filename, sha1=frontmatter['sha1']): sys.exit(0) # Download and verify model. - urllib.urlretrieve( + urllib.request.urlretrieve( frontmatter['caffemodel_url'], model_filename, reporthook) if not model_checks_out(): print('ERROR: model did not download correctly! Run this again.') diff --git a/scripts/split_caffe_proto.py b/scripts/split_caffe_proto.py new file mode 100755 index 00000000000..7e9dc3e7b22 --- /dev/null +++ b/scripts/split_caffe_proto.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +import mmap +import re +import os +import errno + +script_path = os.path.dirname(os.path.realpath(__file__)) + +# a regex to match the parameter definitions in caffe.proto +r = re.compile(r'(?://.*\n)*message ([^ ]*) \{\n(?: .*\n|\n)*\}') + +# create directory to put caffe.proto fragments +try: + os.mkdir( + os.path.join(script_path, + '../docs/_includes/')) + os.mkdir( + os.path.join(script_path, + '../docs/_includes/proto/')) +except OSError as exception: + if exception.errno != errno.EEXIST: + raise + +caffe_proto_fn = os.path.join( + script_path, + '../src/caffe/proto/caffe.proto') + +with open(caffe_proto_fn, 'r') as fin: + + for m in r.finditer(fin.read()): + fn = os.path.join( + script_path, + '../docs/_includes/proto/%s.txt' % m.group(1)) + with open(fn, 'w') as fout: + fout.write(m.group(0)) diff --git a/scripts/travis/build.sh b/scripts/travis/build.sh new file mode 100755 index 00000000000..bb9406f046c --- /dev/null +++ b/scripts/travis/build.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# build the project + +BASEDIR=$(dirname $0) +source $BASEDIR/defaults.sh + +if ! 
$WITH_CMAKE ; then + make --jobs $NUM_THREADS all test pycaffe warn +else + cd build + make --jobs $NUM_THREADS all test.testbin +fi +make lint diff --git a/scripts/travis/configure-cmake.sh b/scripts/travis/configure-cmake.sh new file mode 100644 index 00000000000..772f1e2ce8d --- /dev/null +++ b/scripts/travis/configure-cmake.sh @@ -0,0 +1,32 @@ +# CMake configuration + +mkdir -p build +cd build + +ARGS="-DCMAKE_BUILD_TYPE=Release -DBLAS=Open" + +if $WITH_PYTHON3 ; then + ARGS="$ARGS -Dpython_version=3" +fi + +if $WITH_IO ; then + ARGS="$ARGS -DUSE_OPENCV=On -DUSE_LMDB=On -DUSE_LEVELDB=On" +else + ARGS="$ARGS -DUSE_OPENCV=Off -DUSE_LMDB=Off -DUSE_LEVELDB=Off" +fi + +if $WITH_CUDA ; then + # Only build SM50 + ARGS="$ARGS -DCPU_ONLY=Off -DCUDA_ARCH_NAME=Manual -DCUDA_ARCH_BIN=\"50\" -DCUDA_ARCH_PTX=\"\"" +else + ARGS="$ARGS -DCPU_ONLY=On" +fi + +if $WITH_CUDNN ; then + ARGS="$ARGS -DUSE_CUDNN=On" +else + ARGS="$ARGS -DUSE_CUDNN=Off" +fi + +cmake .. $ARGS + diff --git a/scripts/travis/configure-make.sh b/scripts/travis/configure-make.sh new file mode 100644 index 00000000000..ddc40fffa9d --- /dev/null +++ b/scripts/travis/configure-make.sh @@ -0,0 +1,36 @@ +# raw Makefile configuration + +LINE () { + echo "$@" >> Makefile.config +} + +cp Makefile.config.example Makefile.config + +LINE "BLAS := open" +LINE "WITH_PYTHON_LAYER := 1" + +if $WITH_PYTHON3 ; then + # TODO(lukeyeager) this path is currently disabled because of test errors like: + # ImportError: dynamic module does not define init function (PyInit__caffe) + LINE "PYTHON_LIBRARIES := python3.4m boost_python-py34" + LINE "PYTHON_INCLUDE := /usr/include/python3.4 /usr/lib/python3/dist-packages/numpy/core/include" + LINE "INCLUDE_DIRS := \$(INCLUDE_DIRS) \$(PYTHON_INCLUDE)" +fi + +if ! 
$WITH_IO ; then + LINE "USE_OPENCV := 0" + LINE "USE_LEVELDB := 0" + LINE "USE_LMDB := 0" +fi + +if $WITH_CUDA ; then + # Only build SM50 + LINE "CUDA_ARCH := -gencode arch=compute_50,code=sm_50" +else + LINE "CPU_ONLY := 1" +fi + +if $WITH_CUDNN ; then + LINE "USE_CUDNN := 1" +fi + diff --git a/scripts/travis/configure.sh b/scripts/travis/configure.sh new file mode 100755 index 00000000000..ef740c8982e --- /dev/null +++ b/scripts/travis/configure.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# configure the project + +BASEDIR=$(dirname $0) +source $BASEDIR/defaults.sh + +if ! $WITH_CMAKE ; then + source $BASEDIR/configure-make.sh +else + source $BASEDIR/configure-cmake.sh +fi diff --git a/scripts/travis/defaults.sh b/scripts/travis/defaults.sh new file mode 100755 index 00000000000..d69c0a7d964 --- /dev/null +++ b/scripts/travis/defaults.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# set default environment variables + +set -e + +WITH_CMAKE=${WITH_CMAKE:-false} +WITH_PYTHON3=${WITH_PYTHON3:-false} +WITH_IO=${WITH_IO:-true} +WITH_CUDA=${WITH_CUDA:-false} +WITH_CUDNN=${WITH_CUDNN:-false} diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh new file mode 100755 index 00000000000..2fa2a74a486 --- /dev/null +++ b/scripts/travis/install-deps.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# install dependencies +# (this script must be run as root) + +BASEDIR=$(dirname $0) +source $BASEDIR/defaults.sh + +apt-get -y update +apt-get install -y --no-install-recommends \ + build-essential \ + graphviz \ + libboost-filesystem-dev \ + libboost-python-dev \ + libboost-system-dev \ + libboost-thread-dev \ + libgflags-dev \ + libgoogle-glog-dev \ + libhdf5-serial-dev \ + libopenblas-dev \ + python-virtualenv \ + wget + +if $WITH_CMAKE ; then + apt-get install -y --no-install-recommends cmake +fi + +if ! 
$WITH_PYTHON3 ; then + # Python2 + apt-get install -y --no-install-recommends \ + libprotobuf-dev \ + protobuf-compiler \ + python-dev \ + python-numpy \ + python-protobuf \ + python-pydot \ + python-skimage +else + # Python3 + apt-get install -y --no-install-recommends \ + python3-dev \ + python3-numpy \ + python3-skimage + + # build Protobuf3 since it's needed for Python3 + PROTOBUF3_DIR=~/protobuf3 + pushd . + if [ -d "$PROTOBUF3_DIR" ] && [ -e "$PROTOBUF3_DIR/src/protoc" ]; then + echo "Using cached protobuf3 build ..." + cd $PROTOBUF3_DIR + else + echo "Building protobuf3 from source ..." + rm -rf $PROTOBUF3_DIR + mkdir $PROTOBUF3_DIR + + # install some more dependencies required to build protobuf3 + apt-get install -y --no-install-recommends \ + curl \ + dh-autoreconf \ + unzip + + wget https://github.com/google/protobuf/archive/3.0.x.tar.gz -O protobuf3.tar.gz + tar -xzf protobuf3.tar.gz -C $PROTOBUF3_DIR --strip 1 + rm protobuf3.tar.gz + cd $PROTOBUF3_DIR + ./autogen.sh + ./configure --prefix=/usr + make --jobs=$NUM_THREADS + fi + make install + popd +fi + +if $WITH_IO ; then + apt-get install -y --no-install-recommends \ + libleveldb-dev \ + liblmdb-dev \ + libopencv-dev \ + libsnappy-dev +fi + +if $WITH_CUDA ; then + # install repo packages + CUDA_REPO_PKG=cuda-repo-ubuntu1404_7.5-18_amd64.deb + wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/$CUDA_REPO_PKG + dpkg -i $CUDA_REPO_PKG + rm $CUDA_REPO_PKG + + if $WITH_CUDNN ; then + ML_REPO_PKG=nvidia-machine-learning-repo-ubuntu1404_4.0-2_amd64.deb + wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/$ML_REPO_PKG + dpkg -i $ML_REPO_PKG + fi + + # update package lists + apt-get -y update + + # install packages + CUDA_PKG_VERSION="7-5" + CUDA_VERSION="7.5" + apt-get install -y --no-install-recommends \ + cuda-core-$CUDA_PKG_VERSION \ + cuda-cudart-dev-$CUDA_PKG_VERSION \ + cuda-cublas-dev-$CUDA_PKG_VERSION \ + 
cuda-curand-dev-$CUDA_PKG_VERSION + # manually create CUDA symlink + ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda + + if $WITH_CUDNN ; then + apt-get install -y --no-install-recommends libcudnn6-dev + fi +fi + diff --git a/scripts/travis/install-python-deps.sh b/scripts/travis/install-python-deps.sh new file mode 100755 index 00000000000..910d35a93be --- /dev/null +++ b/scripts/travis/install-python-deps.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# install extra Python dependencies +# (must come after setup-venv) + +BASEDIR=$(dirname $0) +source $BASEDIR/defaults.sh + +if ! $WITH_PYTHON3 ; then + # Python2 + : +else + # Python3 + pip install --pre protobuf==3.0.0b3 + pip install pydot +fi diff --git a/scripts/travis/setup-venv.sh b/scripts/travis/setup-venv.sh new file mode 100755 index 00000000000..81245f146da --- /dev/null +++ b/scripts/travis/setup-venv.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# setup a Python virtualenv +# (must come after install-deps) + +BASEDIR=$(dirname $0) +source $BASEDIR/defaults.sh + +VENV_DIR=${1:-~/venv} + +# setup our own virtualenv +if $WITH_PYTHON3; then + PYTHON_EXE='/usr/bin/python3' +else + PYTHON_EXE='/usr/bin/python2' +fi + +# use --system-site-packages so that Python will use deb packages +virtualenv $VENV_DIR -p $PYTHON_EXE --system-site-packages diff --git a/scripts/travis/test.sh b/scripts/travis/test.sh new file mode 100755 index 00000000000..fedd7e6b56e --- /dev/null +++ b/scripts/travis/test.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# test the project + +BASEDIR=$(dirname $0) +source $BASEDIR/defaults.sh + +if $WITH_CUDA ; then + echo "Skipping tests for CUDA build" + exit 0 +fi + +if ! 
$WITH_CMAKE ; then + make runtest + make pytest +else + cd build + make runtest + make pytest +fi diff --git a/scripts/travis/travis_build_and_test.sh b/scripts/travis/travis_build_and_test.sh deleted file mode 100755 index 174f1ee5a0a..00000000000 --- a/scripts/travis/travis_build_and_test.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -# Script called by Travis to build and test Caffe. -# Travis CI tests are CPU-only for lack of compatible hardware. - -set -e -MAKE="make --jobs=$NUM_THREADS --keep-going" - -if $WITH_CMAKE; then - mkdir build - cd build - CPU_ONLY=" -DCPU_ONLY=ON" - if ! $WITH_CUDA; then - CPU_ONLY=" -DCPU_ONLY=OFF" - fi - PYTHON_ARGS="" - if [ "$PYTHON_VERSION" = "3" ]; then - PYTHON_ARGS="$PYTHON_ARGS -Dpython_version=3 -DBOOST_LIBRARYDIR=$CONDA_DIR/lib/" - fi - if $WITH_IO; then - IO_ARGS="-DUSE_OPENCV=ON -DUSE_LMDB=ON -DUSE_LEVELDB=ON" - else - IO_ARGS="-DUSE_OPENCV=OFF -DUSE_LMDB=OFF -DUSE_LEVELDB=OFF" - fi - cmake -DBUILD_python=ON -DCMAKE_BUILD_TYPE=Release $CPU_ONLY $PYTHON_ARGS -DCMAKE_INCLUDE_PATH="$CONDA_DIR/include/" -DCMAKE_LIBRARY_PATH="$CONDA_DIR/lib/" $IO_ARGS .. - $MAKE - $MAKE pytest - if ! $WITH_CUDA; then - $MAKE runtest - $MAKE lint - fi - $MAKE clean - cd - -else - if ! $WITH_CUDA; then - export CPU_ONLY=1 - fi - if $WITH_IO; then - export USE_LMDB=1 - export USE_LEVELDB=1 - export USE_OPENCV=1 - fi - $MAKE all test pycaffe warn lint || true - if ! $WITH_CUDA; then - $MAKE runtest - fi - $MAKE all - $MAKE test - $MAKE pycaffe - $MAKE pytest - $MAKE warn - if ! $WITH_CUDA; then - $MAKE lint - fi -fi diff --git a/scripts/travis/travis_install.sh b/scripts/travis/travis_install.sh deleted file mode 100755 index 091e92431f0..00000000000 --- a/scripts/travis/travis_install.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash -# This script must be run with sudo. 
- -set -e - -MAKE="make --jobs=$NUM_THREADS" -# Install apt packages where the Ubuntu 12.04 default and ppa works for Caffe - -# This ppa is for gflags and glog -add-apt-repository -y ppa:tuleu/precise-backports -apt-get -y update -apt-get install \ - wget git curl \ - python-dev python-numpy python3-dev\ - libleveldb-dev libsnappy-dev libopencv-dev \ - libprotobuf-dev protobuf-compiler \ - libatlas-dev libatlas-base-dev \ - libhdf5-serial-dev libgflags-dev libgoogle-glog-dev \ - bc - -# Add a special apt-repository to install CMake 2.8.9 for CMake Caffe build, -# if needed. By default, Aptitude in Ubuntu 12.04 installs CMake 2.8.7, but -# Caffe requires a minimum CMake version of 2.8.8. -if $WITH_CMAKE; then - # cmake 3 will make sure that the python interpreter and libraries match - wget --no-check-certificate http://www.cmake.org/files/v3.2/cmake-3.2.3-Linux-x86_64.sh -O cmake3.sh - chmod +x cmake3.sh - ./cmake3.sh --prefix=/usr/ --skip-license --exclude-subdir -fi - -# Install CUDA, if needed -if $WITH_CUDA; then - CUDA_URL=http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1204/x86_64/cuda-repo-ubuntu1204_6.5-14_amd64.deb - CUDA_FILE=/tmp/cuda_install.deb - curl $CUDA_URL -o $CUDA_FILE - dpkg -i $CUDA_FILE - rm -f $CUDA_FILE - apt-get -y update - # Install the minimal CUDA subpackages required to test Caffe build. - # For a full CUDA installation, add 'cuda' to the list of packages. - apt-get -y install cuda-core-6-5 cuda-cublas-6-5 cuda-cublas-dev-6-5 cuda-cudart-6-5 cuda-cudart-dev-6-5 cuda-curand-6-5 cuda-curand-dev-6-5 - # Create CUDA symlink at /usr/local/cuda - # (This would normally be created by the CUDA installer, but we create it - # manually since we did a partial installation.) - ln -s /usr/local/cuda-6.5 /usr/local/cuda -fi - -# Install LMDB -LMDB_URL=https://github.com/LMDB/lmdb/archive/LMDB_0.9.14.tar.gz -LMDB_FILE=/tmp/lmdb.tar.gz -pushd . 
-wget $LMDB_URL -O $LMDB_FILE -tar -C /tmp -xzvf $LMDB_FILE -cd /tmp/lmdb*/libraries/liblmdb/ -$MAKE -$MAKE install -popd -rm -f $LMDB_FILE - -# Install the Python runtime dependencies via miniconda (this is much faster -# than using pip for everything). -export PATH=$CONDA_DIR/bin:$PATH -# clear any cached conda (see #3786) -rm -rf $CONDA_DIR -if [ ! -d $CONDA_DIR ]; then - if [ "$PYTHON_VERSION" -eq "3" ]; then - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - else - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh - fi - chmod +x miniconda.sh - ./miniconda.sh -b -p $CONDA_DIR - - conda update --yes conda - # The version of boost we're using for Python 3 depends on 3.4 for now. - if [ "$PYTHON_VERSION" -eq "3" ]; then - conda install --yes python=3.4 - fi - conda install --yes numpy scipy matplotlib scikit-image pip - # Let conda install boost (so that boost_python matches) - conda install --yes -c https://conda.binstar.org/menpo boost=1.56.0 -fi - -# install protobuf 3 (just use the miniconda3 directory to avoid having to setup the path again) -if [ "$PYTHON_VERSION" -eq "3" ] && [ ! -e "$CONDA_DIR/bin/protoc" ]; then - pushd . - wget https://github.com/google/protobuf/archive/v3.0.0-alpha-3.1.tar.gz -O protobuf-3.tar.gz - tar -C /tmp -xzvf protobuf-3.tar.gz - cd /tmp/protobuf-3*/ - ./autogen.sh - ./configure --prefix=$CONDA_DIR - $MAKE - $MAKE install - popd -fi - -if [ "$PYTHON_VERSION" -eq "3" ]; then - pip install --pre protobuf==3.0.0b2 -else - pip install protobuf -fi diff --git a/scripts/travis/travis_setup_makefile_config.sh b/scripts/travis/travis_setup_makefile_config.sh deleted file mode 100755 index 83aacf11fb0..00000000000 --- a/scripts/travis/travis_setup_makefile_config.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -set -e - -mv Makefile.config.example Makefile.config - -if $WITH_CUDA; then - # Only generate compute_50. 
- GENCODE="-gencode arch=compute_50,code=sm_50" - GENCODE="$GENCODE -gencode arch=compute_50,code=compute_50" - echo "CUDA_ARCH := $GENCODE" >> Makefile.config -fi - -# Remove IO library settings from Makefile.config -# to avoid conflicts with CI configuration -sed -i -e '/USE_LMDB/d' Makefile.config -sed -i -e '/USE_LEVELDB/d' Makefile.config -sed -i -e '/USE_OPENCV/d' Makefile.config - -cat << 'EOF' >> Makefile.config -# Travis' nvcc doesn't like newer boost versions -NVCCFLAGS := -Xcudafe --diag_suppress=cc_clobber_ignored -Xcudafe --diag_suppress=useless_using_declaration -Xcudafe --diag_suppress=set_but_not_used -ANACONDA_HOME := $(CONDA_DIR) -PYTHON_INCLUDE := $(ANACONDA_HOME)/include \ - $(ANACONDA_HOME)/include/python2.7 \ - $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include -PYTHON_LIB := $(ANACONDA_HOME)/lib -INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include -LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib -WITH_PYTHON_LAYER := 1 -EOF diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt index 8a80c940488..4a805568566 100644 --- a/src/caffe/CMakeLists.txt +++ b/src/caffe/CMakeLists.txt @@ -3,9 +3,12 @@ file(GLOB proto_files proto/*.proto) caffe_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python ${proto_files}) # include python files either to force generation -add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python}) -set(Caffe_LINKER_LIBS proto ${Caffe_LINKER_LIBS}) # note, crucial to prepend! -caffe_default_properties(proto) +add_library(caffeproto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python}) +caffe_default_properties(caffeproto) +target_link_libraries(caffeproto PUBLIC ${PROTOBUF_LIBRARIES}) +target_include_directories(caffeproto PUBLIC ${PROTOBUF_INCLUDE_DIR}) + +list(INSERT Caffe_LINKER_LIBS 0 PUBLIC caffeproto) # note, crucial to prepend! 
# --[ Caffe library @@ -18,8 +21,16 @@ if(HAVE_CUDA) endif() add_library(caffe ${srcs}) -target_link_libraries(caffe proto ${Caffe_LINKER_LIBS}) caffe_default_properties(caffe) +target_link_libraries(caffe ${Caffe_LINKER_LIBS}) +target_include_directories(caffe ${Caffe_INCLUDE_DIRS} + PUBLIC + $ + $) +target_compile_definitions(caffe ${Caffe_DEFINITIONS}) +if(Caffe_COMPILE_OPTIONS) + target_compile_options(caffe ${Caffe_COMPILE_OPTIONS}) +endif() set_target_properties(caffe PROPERTIES VERSION ${CAFFE_TARGET_VERSION} SOVERSION ${CAFFE_TARGET_SOVERSION} @@ -29,12 +40,11 @@ set_target_properties(caffe PROPERTIES add_subdirectory(test) # ---[ Install -install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION include) -install(FILES ${proto_hdrs} DESTINATION include/caffe/proto) -install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib) +install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install(FILES ${proto_hdrs} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/caffe/proto) +install(TARGETS caffe caffeproto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR}) file(WRITE ${PROJECT_BINARY_DIR}/__init__.py) list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py) install(PROGRAMS ${proto_python} DESTINATION python/caffe/proto) - diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index c86fd5d1d94..603e52f7025 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -30,7 +30,9 @@ void Blob::Reshape(const vector& shape) { int* shape_data = static_cast(shape_data_->mutable_cpu_data()); for (int i = 0; i < shape.size(); ++i) { CHECK_GE(shape[i], 0); - CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX"; + if (count_ != 0) { + CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX"; + } count_ *= shape[i]; shape_[i] = shape[i]; shape_data[i] = shape[i]; @@ -87,6 +89,12 @@ const Dtype* Blob::cpu_data() const { template void Blob::set_cpu_data(Dtype* data) { CHECK(data); + // Make sure CPU and GPU sizes 
remain equal + size_t size = count_ * sizeof(Dtype); + if (data_->size() != size) { + data_.reset(new SyncedMemory(size)); + diff_.reset(new SyncedMemory(size)); + } data_->set_cpu_data(data); } @@ -96,6 +104,18 @@ const Dtype* Blob::gpu_data() const { return (const Dtype*)data_->gpu_data(); } +template +void Blob::set_gpu_data(Dtype* data) { + CHECK(data); + // Make sure CPU and GPU sizes remain equal + size_t size = count_ * sizeof(Dtype); + if (data_->size() != size) { + data_.reset(new SyncedMemory(size)); + diff_.reset(new SyncedMemory(size)); + } + data_->set_gpu_data(data); +} + template const Dtype* Blob::cpu_diff() const { CHECK(diff_); diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index dee681654aa..4f6f9bccc38 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -53,7 +53,7 @@ void GlobalInit(int* pargc, char*** pargv) { Caffe::Caffe() : random_generator_(), mode_(Caffe::CPU), - solver_count_(1), root_solver_(true) { } + solver_count_(1), solver_rank_(0), multiprocess_(false) { } Caffe::~Caffe() { } @@ -106,7 +106,8 @@ void* Caffe::RNG::generator() { Caffe::Caffe() : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(), - mode_(Caffe::CPU), solver_count_(1), root_solver_(true) { + mode_(Caffe::CPU), + solver_count_(1), solver_rank_(0), multiprocess_(false) { // Try to create a cublas handler, and report an error if failed (but we will // keep the program running as one might just want to run CPU code). 
if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { diff --git a/src/caffe/data_reader.cpp b/src/caffe/data_reader.cpp deleted file mode 100644 index 9f019bbfcb7..00000000000 --- a/src/caffe/data_reader.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/data_reader.hpp" -#include "caffe/layers/data_layer.hpp" -#include "caffe/proto/caffe.pb.h" - -namespace caffe { - -using boost::weak_ptr; - -map > DataReader::bodies_; -static boost::mutex bodies_mutex_; - -DataReader::DataReader(const LayerParameter& param) - : queue_pair_(new QueuePair( // - param.data_param().prefetch() * param.data_param().batch_size())) { - // Get or create a body - boost::mutex::scoped_lock lock(bodies_mutex_); - string key = source_key(param); - weak_ptr& weak = bodies_[key]; - body_ = weak.lock(); - if (!body_) { - body_.reset(new Body(param)); - bodies_[key] = weak_ptr(body_); - } - body_->new_queue_pairs_.push(queue_pair_); -} - -DataReader::~DataReader() { - string key = source_key(body_->param_); - body_.reset(); - boost::mutex::scoped_lock lock(bodies_mutex_); - if (bodies_[key].expired()) { - bodies_.erase(key); - } -} - -// - -DataReader::QueuePair::QueuePair(int size) { - // Initialize the free queue with requested number of datums - for (int i = 0; i < size; ++i) { - free_.push(new Datum()); - } -} - -DataReader::QueuePair::~QueuePair() { - Datum* datum; - while (free_.try_pop(&datum)) { - delete datum; - } - while (full_.try_pop(&datum)) { - delete datum; - } -} - -// - -DataReader::Body::Body(const LayerParameter& param) - : param_(param), - new_queue_pairs_() { - StartInternalThread(); -} - -DataReader::Body::~Body() { - StopInternalThread(); -} - -void DataReader::Body::InternalThreadEntry() { - shared_ptr db(db::GetDB(param_.data_param().backend())); - db->Open(param_.data_param().source(), db::READ); - shared_ptr cursor(db->NewCursor()); - vector > qps; - try { - int solver_count = 
param_.phase() == TRAIN ? Caffe::solver_count() : 1; - - // To ensure deterministic runs, only start running once all solvers - // are ready. But solvers need to peek on one item during initialization, - // so read one item, then wait for the next solver. - for (int i = 0; i < solver_count; ++i) { - shared_ptr qp(new_queue_pairs_.pop()); - read_one(cursor.get(), qp.get()); - qps.push_back(qp); - } - // Main loop - while (!must_stop()) { - for (int i = 0; i < solver_count; ++i) { - read_one(cursor.get(), qps[i].get()); - } - // Check no additional readers have been created. This can happen if - // more than one net is trained at a time per process, whether single - // or multi solver. It might also happen if two data layers have same - // name and same source. - CHECK_EQ(new_queue_pairs_.size(), 0); - } - } catch (boost::thread_interrupted&) { - // Interrupted exception is expected on shutdown - } -} - -void DataReader::Body::read_one(db::Cursor* cursor, QueuePair* qp) { - Datum* datum = qp->free_.pop(); - // TODO deserialize in-place instead of copy? - datum->ParseFromString(cursor->value()); - qp->full_.push(datum); - - // go to the next iter - cursor->Next(); - if (!cursor->valid()) { - DLOG(INFO) << "Restarting data prefetching from start."; - cursor->SeekToFirst(); - } -} - -} // namespace caffe diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index 7189d67e289..3012251e0a5 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -130,7 +130,7 @@ void DataTransformer::Transform(const Datum& datum, template void DataTransformer::Transform(const Datum& datum, Blob* transformed_blob) { - // If datum is encoded, decoded and transform the cv::image. + // If datum is encoded, decode and transform the cv::image. 
if (datum.encoded()) { #ifdef USE_OPENCV CHECK(!(param_.force_color() && param_.force_gray())) diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index 104884e0295..11de4979935 100644 --- a/src/caffe/internal_thread.cpp +++ b/src/caffe/internal_thread.cpp @@ -28,25 +28,27 @@ void InternalThread::StartInternalThread() { Caffe::Brew mode = Caffe::mode(); int rand_seed = caffe_rng_rand(); int solver_count = Caffe::solver_count(); - bool root_solver = Caffe::root_solver(); + int solver_rank = Caffe::solver_rank(); + bool multiprocess = Caffe::multiprocess(); try { thread_.reset(new boost::thread(&InternalThread::entry, this, device, mode, - rand_seed, solver_count, root_solver)); + rand_seed, solver_count, solver_rank, multiprocess)); } catch (std::exception& e) { LOG(FATAL) << "Thread exception: " << e.what(); } } void InternalThread::entry(int device, Caffe::Brew mode, int rand_seed, - int solver_count, bool root_solver) { + int solver_count, int solver_rank, bool multiprocess) { #ifndef CPU_ONLY CUDA_CHECK(cudaSetDevice(device)); #endif Caffe::set_mode(mode); Caffe::set_random_seed(rand_seed); Caffe::set_solver_count(solver_count); - Caffe::set_root_solver(root_solver); + Caffe::set_solver_rank(solver_rank); + Caffe::set_multiprocess(multiprocess); InternalThreadEntry(); } diff --git a/src/caffe/layer.cpp b/src/caffe/layer.cpp index 3b9128986ae..684ae88bb49 100644 --- a/src/caffe/layer.cpp +++ b/src/caffe/layer.cpp @@ -1,27 +1,7 @@ -#include #include "caffe/layer.hpp" namespace caffe { -template -void Layer::InitMutex() { - forward_mutex_.reset(new boost::mutex()); -} - -template -void Layer::Lock() { - if (IsShared()) { - forward_mutex_->lock(); - } -} - -template -void Layer::Unlock() { - if (IsShared()) { - forward_mutex_->unlock(); - } -} - INSTANTIATE_CLASS(Layer); } // namespace caffe diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index e967bd6181c..f14253a510e 100644 --- a/src/caffe/layer_factory.cpp +++ 
b/src/caffe/layer_factory.cpp @@ -67,6 +67,7 @@ shared_ptr > GetConvolutionLayer( #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } @@ -104,6 +105,7 @@ shared_ptr > GetPoolingLayer(const LayerParameter& param) { #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } @@ -141,6 +143,7 @@ shared_ptr > GetLRNLayer(const LayerParameter& param) { #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } @@ -164,6 +167,7 @@ shared_ptr > GetReLULayer(const LayerParameter& param) { #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } @@ -187,6 +191,7 @@ shared_ptr > GetSigmoidLayer(const LayerParameter& param) { #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } @@ -210,6 +215,7 @@ shared_ptr > GetSoftmaxLayer(const LayerParameter& param) { #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } @@ -233,6 +239,7 @@ shared_ptr > GetTanHLayer(const LayerParameter& param) { #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 4a4c68e009a..35c90145e31 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -19,7 +19,6 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, const int num_axes = bottom[0]->num_axes(); num_spatial_axes_ = num_axes - first_spatial_axis; CHECK_GE(num_spatial_axes_, 0); - vector bottom_dim_blob_shape(1, num_spatial_axes_ + 1); vector spatial_dim_blob_shape(1, std::max(num_spatial_axes_, 1)); // 
Setup filter kernel dimensions (kernel_shape_). kernel_shape_.Reshape(spatial_dim_blob_shape); diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 989319f1a07..93a798f3571 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -36,9 +36,11 @@ template BasePrefetchingDataLayer::BasePrefetchingDataLayer( const LayerParameter& param) : BaseDataLayer(param), - prefetch_free_(), prefetch_full_() { - for (int i = 0; i < PREFETCH_COUNT; ++i) { - prefetch_free_.push(&prefetch_[i]); + prefetch_(param.data_param().prefetch()), + prefetch_free_(), prefetch_full_(), prefetch_current_() { + for (int i = 0; i < prefetch_.size(); ++i) { + prefetch_[i].reset(new Batch()); + prefetch_free_.push(prefetch_[i].get()); } } @@ -46,22 +48,23 @@ template void BasePrefetchingDataLayer::LayerSetUp( const vector*>& bottom, const vector*>& top) { BaseDataLayer::LayerSetUp(bottom, top); + // Before starting the prefetch thread, we make cpu_data and gpu_data // calls so that the prefetch thread does not accidentally make simultaneous // cudaMalloc calls when the main thread is running. In some GPUs this // seems to cause failures if we do not so. 
- for (int i = 0; i < PREFETCH_COUNT; ++i) { - prefetch_[i].data_.mutable_cpu_data(); + for (int i = 0; i < prefetch_.size(); ++i) { + prefetch_[i]->data_.mutable_cpu_data(); if (this->output_labels_) { - prefetch_[i].label_.mutable_cpu_data(); + prefetch_[i]->label_.mutable_cpu_data(); } } #ifndef CPU_ONLY if (Caffe::mode() == Caffe::GPU) { - for (int i = 0; i < PREFETCH_COUNT; ++i) { - prefetch_[i].data_.mutable_gpu_data(); + for (int i = 0; i < prefetch_.size(); ++i) { + prefetch_[i]->data_.mutable_gpu_data(); if (this->output_labels_) { - prefetch_[i].label_.mutable_gpu_data(); + prefetch_[i]->label_.mutable_gpu_data(); } } } @@ -88,6 +91,9 @@ void BasePrefetchingDataLayer::InternalThreadEntry() { #ifndef CPU_ONLY if (Caffe::mode() == Caffe::GPU) { batch->data_.data().get()->async_gpu_push(stream); + if (this->output_labels_) { + batch->label_.data().get()->async_gpu_push(stream); + } CUDA_CHECK(cudaStreamSynchronize(stream)); } #endif @@ -106,22 +112,18 @@ void BasePrefetchingDataLayer::InternalThreadEntry() { template void BasePrefetchingDataLayer::Forward_cpu( const vector*>& bottom, const vector*>& top) { - Batch* batch = prefetch_full_.pop("Data layer prefetch queue empty"); + if (prefetch_current_) { + prefetch_free_.push(prefetch_current_); + } + prefetch_current_ = prefetch_full_.pop("Waiting for data"); // Reshape to loaded data. - top[0]->ReshapeLike(batch->data_); - // Copy the data - caffe_copy(batch->data_.count(), batch->data_.cpu_data(), - top[0]->mutable_cpu_data()); - DLOG(INFO) << "Prefetch copied"; + top[0]->ReshapeLike(prefetch_current_->data_); + top[0]->set_cpu_data(prefetch_current_->data_.mutable_cpu_data()); if (this->output_labels_) { // Reshape to loaded labels. - top[1]->ReshapeLike(batch->label_); - // Copy the labels. 
- caffe_copy(batch->label_.count(), batch->label_.cpu_data(), - top[1]->mutable_cpu_data()); + top[1]->ReshapeLike(prefetch_current_->label_); + top[1]->set_cpu_data(prefetch_current_->label_.mutable_cpu_data()); } - - prefetch_free_.push(batch); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/base_data_layer.cu b/src/caffe/layers/base_data_layer.cu index 4056d36a7b4..64c621a74f1 100644 --- a/src/caffe/layers/base_data_layer.cu +++ b/src/caffe/layers/base_data_layer.cu @@ -7,23 +7,18 @@ namespace caffe { template void BasePrefetchingDataLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { - Batch* batch = prefetch_full_.pop("Data layer prefetch queue empty"); + if (prefetch_current_) { + prefetch_free_.push(prefetch_current_); + } + prefetch_current_ = prefetch_full_.pop("Waiting for data"); // Reshape to loaded data. - top[0]->ReshapeLike(batch->data_); - // Copy the data - caffe_copy(batch->data_.count(), batch->data_.gpu_data(), - top[0]->mutable_gpu_data()); + top[0]->ReshapeLike(prefetch_current_->data_); + top[0]->set_gpu_data(prefetch_current_->data_.mutable_gpu_data()); if (this->output_labels_) { // Reshape to loaded labels. - top[1]->ReshapeLike(batch->label_); - // Copy the labels. - caffe_copy(batch->label_.count(), batch->label_.gpu_data(), - top[1]->mutable_gpu_data()); + top[1]->ReshapeLike(prefetch_current_->label_); + top[1]->set_gpu_data(prefetch_current_->label_.mutable_gpu_data()); } - // Ensure the copy is synchronous wrt the host, so that the next batch isn't - // copied in meanwhile. 
- CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - prefetch_free_.push(batch); } INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer); diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp index a69d8f99316..c6a1d5b1b2c 100644 --- a/src/caffe/layers/batch_norm_layer.cpp +++ b/src/caffe/layers/batch_norm_layer.cpp @@ -27,13 +27,25 @@ void BatchNormLayer::LayerSetUp(const vector*>& bottom, sz.push_back(channels_); this->blobs_[0].reset(new Blob(sz)); this->blobs_[1].reset(new Blob(sz)); - sz[0]=1; + sz[0] = 1; this->blobs_[2].reset(new Blob(sz)); for (int i = 0; i < 3; ++i) { caffe_set(this->blobs_[i]->count(), Dtype(0), this->blobs_[i]->mutable_cpu_data()); } } + // Mask statistics from optimization by setting local learning rates + // for mean, variance, and the bias correction to zero. + for (int i = 0; i < this->blobs_.size(); ++i) { + if (this->layer_param_.param_size() == i) { + ParamSpec* fixed_param_spec = this->layer_param_.add_param(); + fixed_param_spec->set_lr_mult(0.f); + } else { + CHECK_EQ(this->layer_param_.param(i).lr_mult(), 0.f) + << "Cannot configure batch normalization statistics as layer " + << "parameters."; + } + } } template @@ -49,7 +61,7 @@ void BatchNormLayer::Reshape(const vector*>& bottom, variance_.Reshape(sz); temp_.ReshapeLike(*bottom[0]); x_norm_.ReshapeLike(*bottom[0]); - sz[0]=bottom[0]->shape(0); + sz[0] = bottom[0]->shape(0); batch_sum_multiplier_.Reshape(sz); int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0)); @@ -112,8 +124,8 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, if (!use_global_stats_) { // compute variance using var(X) = E((X-EX)^2) - caffe_powx(top[0]->count(), top_data, Dtype(2), - temp_.mutable_cpu_data()); // (X-EX)^2 + caffe_sqr(top[0]->count(), top_data, + temp_.mutable_cpu_data()); // (X-EX)^2 caffe_cpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1. 
/ (num * spatial_dim), temp_.cpu_data(), spatial_sum_multiplier_.cpu_data(), 0., @@ -136,7 +148,7 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, // normalize variance caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data()); - caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5), + caffe_sqrt(variance_.count(), variance_.cpu_data(), variance_.mutable_cpu_data()); // replicate variance to input size diff --git a/src/caffe/layers/batch_norm_layer.cu b/src/caffe/layers/batch_norm_layer.cu index c21713c81d9..a35e778e2f1 100644 --- a/src/caffe/layers/batch_norm_layer.cu +++ b/src/caffe/layers/batch_norm_layer.cu @@ -48,14 +48,14 @@ void BatchNormLayer::Forward_gpu(const vector*>& bottom, if (!use_global_stats_) { // compute variance using var(X) = E((X-EX)^2) - caffe_gpu_powx(top[0]->count(), top_data, Dtype(2), + caffe_gpu_mul(top[0]->count(), top[0]->gpu_data(), top[0]->gpu_data(), temp_.mutable_gpu_data()); // (X-EX)^2 caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1. 
/ (num * spatial_dim), temp_.gpu_data(), spatial_sum_multiplier_.gpu_data(), 0., num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemv(CblasTrans, num, channels_, 1., - num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0., + caffe_gpu_gemv(CblasTrans, num, channels_, Dtype(1.), + num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), Dtype(0.), variance_.mutable_gpu_data()); // E((X_EX)^2) // compute and save moving average @@ -72,7 +72,7 @@ void BatchNormLayer::Forward_gpu(const vector*>& bottom, // normalize variance caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), + caffe_gpu_sqrt(variance_.count(), variance_.gpu_data(), variance_.mutable_gpu_data()); // replicate variance to input size diff --git a/src/caffe/layers/crop_layer.cpp b/src/caffe/layers/crop_layer.cpp index e81bdd732f3..65ea8f8b7d0 100644 --- a/src/caffe/layers/crop_layer.cpp +++ b/src/caffe/layers/crop_layer.cpp @@ -15,8 +15,7 @@ namespace caffe { template void CropLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { - // All logic that depends only on the number of dimensions is here, - // the rest is in Reshape because it depends on Blob size. + // LayerSetUp() handles the number of dimensions; Reshape() handles the sizes. // bottom[0] supplies the data // bottom[1] supplies the size const CropParameter& param = this->layer_param_.crop_param(); @@ -40,45 +39,48 @@ void CropLayer::Reshape(const vector*>& bottom, int input_dim = bottom[0]->num_axes(); const int start_axis = bottom[0]->CanonicalAxisIndex(param.axis()); - // initialize all offsets to 0 - offsets = vector(input_dim, 0); - // initialize new shape to bottom[0] 
vector new_shape(bottom[0]->shape()); + vector offsets_shape(1, input_dim); + offsets.Reshape(offsets_shape); + int* offset_data = offsets.mutable_cpu_data(); - // apply crops + // Determine crop offsets and the new shape post-crop. for (int i = 0; i < input_dim; ++i) { int crop_offset = 0; - int new_size = bottom[0]->shape(i); + int new_size = bottom[0]->shape(i); if (i >= start_axis) { new_size = bottom[1]->shape(i); - if (param.offset_size() == 1) { - // if only one crop value is supplied, crop all dimensions after axis - // by this crop value + // If only one offset is given, all crops have the same offset. crop_offset = param.offset(0); } else if (param.offset_size() > 1) { - // crop values specified must be equal to the number of dimensions - // following axis + // For several offsets, the number of offsets must be equal to the + // number of dimensions to crop, that is dimensions after the axis. crop_offset = param.offset(i - start_axis); } + // Check that the crop and offset are within the dimension's bounds. + CHECK_GE(bottom[0]->shape(i) - crop_offset, bottom[1]->shape(i)) + << "the crop for dimension " << i << " is out-of-bounds with " + << "size " << bottom[1]->shape(i) << " and offset " << crop_offset; } - // Check that the image we are cropping minus the margin is bigger - // than the destination image. 
- CHECK_GE(bottom[0]->shape(i) - crop_offset, - bottom[1]->shape(i)) - << "invalid crop parameters in dimension: " << i; - // Now set new size and offsets new_shape[i] = new_size; - offsets[i] = crop_offset; + offset_data[i] = crop_offset; } top[0]->Reshape(new_shape); + // Compute strides + src_strides_.Reshape(offsets_shape); + dest_strides_.Reshape(offsets_shape); + for (int i = 0; i < input_dim; ++i) { + src_strides_.mutable_cpu_data()[i] = bottom[0]->count(i + 1, input_dim); + dest_strides_.mutable_cpu_data()[i] = top[0]->count(i + 1, input_dim); + } } -// recursive copy function template void CropLayer::crop_copy(const vector*>& bottom, const vector*>& top, - const vector& offsets, + const int* offsets, vector indices, int cur_dim, const Dtype* src_data, @@ -92,28 +94,26 @@ void CropLayer::crop_copy(const vector*>& bottom, src_data, dest_data, is_forward); } } else { - // We are at the last dimensions, which is stored continously in memory - for (int i = 0; i < top[0]->shape(cur_dim); ++i) { - // prepare index vector reduced(red) and with offsets(off) - std::vector ind_red(cur_dim, 0); - std::vector ind_off(cur_dim+1, 0); - for (int j = 0; j < cur_dim; ++j) { - ind_red[j] = indices[j]; - ind_off[j] = indices[j] + offsets[j]; - } - ind_off[cur_dim] = offsets[cur_dim]; - // do the copy - if (is_forward) { - caffe_copy(top[0]->shape(cur_dim), - src_data + bottom[0]->offset(ind_off), - dest_data + top[0]->offset(ind_red)); - } else { - // in the backwards pass the src_data is top_diff - // and the dest_data is bottom_diff - caffe_copy(top[0]->shape(cur_dim), - src_data + top[0]->offset(ind_red), - dest_data + bottom[0]->offset(ind_off)); - } + // We are at the last dimensions, which is stored continuously in memory + // prepare index vector reduced(red) and with offsets(off) + std::vector ind_red(cur_dim, 0); + std::vector ind_off(cur_dim+1, 0); + for (int j = 0; j < cur_dim; ++j) { + ind_red[j] = indices[j]; + ind_off[j] = indices[j] + offsets[j]; + } + 
ind_off[cur_dim] = offsets[cur_dim]; + // do the copy + if (is_forward) { + caffe_copy(top[0]->shape(cur_dim), + src_data + bottom[0]->offset(ind_off), + dest_data + top[0]->offset(ind_red)); + } else { + // in the backwards pass the src_data is top_diff + // and the dest_data is bottom_diff + caffe_copy(top[0]->shape(cur_dim), + src_data + top[0]->offset(ind_red), + dest_data + bottom[0]->offset(ind_off)); } } } @@ -124,7 +124,8 @@ void CropLayer::Forward_cpu(const vector*>& bottom, std::vector indices(top[0]->num_axes(), 0); const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); - crop_copy(bottom, top, offsets, indices, 0, bottom_data, top_data, true); + crop_copy(bottom, top, offsets.cpu_data(), indices, 0, bottom_data, top_data, + true); } template @@ -136,7 +137,8 @@ void CropLayer::Backward_cpu(const vector*>& top, if (propagate_down[0]) { caffe_set(bottom[0]->count(), static_cast(0), bottom_diff); std::vector indices(top[0]->num_axes(), 0); - crop_copy(bottom, top, offsets, indices, 0, top_diff, bottom_diff, false); + crop_copy(bottom, top, offsets.cpu_data(), indices, 0, top_diff, + bottom_diff, false); } } diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu index 9ed8f7cce57..4ece9cd1761 100644 --- a/src/caffe/layers/crop_layer.cu +++ b/src/caffe/layers/crop_layer.cu @@ -4,105 +4,63 @@ namespace caffe { -// Copy (one line per thread) from one array to another, with arbitrary -// strides in the last two dimensions. 
+__device__ int compute_uncropped_index( + int index, + const int ndims, + const int* src_strides, + const int* dest_strides, + const int* offsets) { + int dest_index = index; + int src_index = 0; + for (int i = 0; i < ndims; ++i) { + int coord = dest_index / dest_strides[i]; + dest_index -= coord * dest_strides[i]; + src_index += src_strides[i] * (coord + offsets[i]); + } + return src_index; +} + template -__global__ void copy_kernel(const int n, const int height, const int width, - const int src_outer_stride, const int src_inner_stride, - const int dest_outer_stride, const int dest_inner_stride, +__global__ void crop_kernel_forward(const int nthreads, + const int ndims, + const int* src_strides, + const int* dest_strides, + const int* offsets, const Dtype* src, Dtype* dest) { - CUDA_KERNEL_LOOP(index, n) { - int src_start = index / height * src_outer_stride - + index % height * src_inner_stride; - int dest_start = index / height * dest_outer_stride - + index % height * dest_inner_stride; - for (int i = 0; i < width; ++i) { - dest[dest_start + i] = src[src_start + i]; - } + CUDA_KERNEL_LOOP(index, nthreads) { + int src_index = compute_uncropped_index( + index, ndims, src_strides, dest_strides, offsets); + dest[index] = src[src_index]; } } -// recursive copy function, this function is similar to crop_copy but loops -// over all but the last two dimensions. It is implemented this way to allow -// for ND cropping while still relying on a CUDA kernel for the innermost -// two dimensions for performance reasons. -// An alternative way to implement ND cropping relying more on the kernel -// would require passing offsets to the kernel, which is a bit problematic -// because it is of variable length. Since in the standard (N,C,W,H) case -// N,C are usually not cropped a speedup could be achieved by not looping -// the application of the copy_kernel around these dimensions. 
template -void CropLayer::crop_copy_gpu(const vector*>& bottom, - const vector*>& top, - const vector& offsets, - vector indices, - int cur_dim, - const Dtype* src_data, - Dtype* dest_data, - bool is_forward) { - if (cur_dim + 2 < top[0]->num_axes()) { - // We are not yet at the final dimension, call copy recursivley - for (int i = 0; i < top[0]->shape(cur_dim); ++i) { - indices[cur_dim] = i; - crop_copy_gpu(bottom, top, offsets, indices, cur_dim+1, - src_data, dest_data, is_forward); - } - } else { - // We are at the last two dimensions, which are stored continously in memory - // With (N,C,H,W) - // (0,1,2,3) cur_dim -> H - // cur_dim+1 -> W - const int lines = top[0]->shape(cur_dim); - const int height = top[0]->shape(cur_dim); - const int width = top[0]->shape(cur_dim+1); - std::vector ind_off(cur_dim+2, 0); - for (int j = 0; j < cur_dim; ++j) { - ind_off[j] = indices[j] + offsets[j]; - } - ind_off[cur_dim] = offsets[cur_dim]; - ind_off[cur_dim+1] = offsets[cur_dim+1]; - // Compute copy strides - const int src_outer_stride = - bottom[0]->shape(cur_dim)*bottom[0]->shape(cur_dim+1); - const int src_inner_stride = bottom[0]->shape(cur_dim+1); - const int dest_outer_stride = - top[0]->shape(cur_dim)*top[0]->shape(cur_dim+1); - const int dest_inner_stride = top[0]->shape(cur_dim+1); - - if (is_forward) { - const Dtype* bottom_data = bottom[0]->gpu_data() + - bottom[0]->offset(ind_off); - Dtype* top_data = top[0]->mutable_gpu_data() + - top[0]->offset(indices); - // NOLINT_NEXT_LINE(whitespace/operators) - copy_kernel<<>>( - lines, height, width, - src_outer_stride, src_inner_stride, - dest_outer_stride, dest_inner_stride, - bottom_data, top_data); - - } else { - const Dtype* top_diff = top[0]->gpu_diff() + - top[0]->offset(indices); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff() + - bottom[0]->offset(ind_off); - // NOLINT_NEXT_LINE(whitespace/operators) - copy_kernel<<>>( - lines, height, width, - dest_outer_stride, dest_inner_stride, - src_outer_stride, 
src_inner_stride, - top_diff, bottom_diff); - } +__global__ void crop_kernel_backward(const int nthreads, + const int ndims, + const int* src_strides, + const int* dest_strides, + const int* offsets, + Dtype* src, const Dtype* dest) { + CUDA_KERNEL_LOOP(index, nthreads) { + int src_index = compute_uncropped_index( + index, ndims, src_strides, dest_strides, offsets); + src[src_index] = dest[index]; } } template void CropLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { - std::vector indices(top[0]->num_axes(), 0); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - crop_copy_gpu(bottom, top, offsets, indices, 0, bottom_data, top_data, true); + int n = top[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + crop_kernel_forward<<>>(n, + bottom[0]->num_axes(), + src_strides_.gpu_data(), + dest_strides_.gpu_data(), + offsets.gpu_data(), + bottom_data, top_data); } template @@ -110,12 +68,17 @@ void CropLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + int n = top[0]->count(); if (propagate_down[0]) { caffe_gpu_set(bottom[0]->count(), static_cast(0), bottom_diff); - std::vector indices(top[0]->num_axes(), 0); - crop_copy_gpu(bottom, top, offsets, indices, 0, top_diff, bottom_diff, - false); + // NOLINT_NEXT_LINE(whitespace/operators) + crop_kernel_backward<<>>(n, + bottom[0]->num_axes(), + src_strides_.gpu_data(), + dest_strides_.gpu_data(), + offsets.gpu_data(), + bottom_diff, top_diff); } } diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index 1987fb096b0..efc9e04e8c0 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -252,6 +252,7 @@ CuDNNConvolutionLayer::~CuDNNConvolutionLayer() { } cudaFree(workspaceData); + delete [] workspace; delete [] stream_; delete [] handle_; 
delete [] fwd_algo_; diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu index 42c4fd0260c..8bc5346248c 100644 --- a/src/caffe/layers/cudnn_conv_layer.cu +++ b/src/caffe/layers/cudnn_conv_layer.cu @@ -30,19 +30,11 @@ void CuDNNConvolutionLayer::Forward_gpu( // Bias. if (this->bias_term_) { const Dtype* bias_data = this->blobs_[1]->gpu_data(); -#if CUDNN_VERSION_MIN(4, 0, 0) CUDNN_CHECK(cudnnAddTensor(handle_[g], cudnn::dataType::one, bias_desc_, bias_data + bias_offset_ * g, cudnn::dataType::one, top_descs_[i], top_data + top_offset_ * g)); -#else - CUDNN_CHECK(cudnnAddTensor(handle_[g], CUDNN_ADD_SAME_C, - cudnn::dataType::one, - bias_desc_, bias_data + bias_offset_ * g, - cudnn::dataType::one, - top_descs_[i], top_data + top_offset_ * g)); -#endif } } @@ -82,7 +74,7 @@ void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, // Gradient w.r.t. weights. if (this->param_propagate_down_[0]) { const Dtype* bottom_data = bottom[i]->gpu_data(); - CUDNN_CHECK(cudnnConvolutionBackwardFilter_v3( + CUDNN_CHECK(cudnnConvolutionBackwardFilter( handle_[1*this->group_ + g], cudnn::dataType::one, bottom_descs_[i], bottom_data + bottom_offset_ * g, @@ -100,7 +92,7 @@ void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, weight = this->blobs_[0]->gpu_data(); } Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnConvolutionBackwardData_v3( + CUDNN_CHECK(cudnnConvolutionBackwardData( handle_[2*this->group_ + g], cudnn::dataType::one, filter_desc_, weight + this->weight_offset_ * g, diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp index c86c6907113..687c905763e 100644 --- a/src/caffe/layers/cudnn_relu_layer.cpp +++ b/src/caffe/layers/cudnn_relu_layer.cpp @@ -13,6 +13,7 @@ void CuDNNReLULayer::LayerSetUp(const vector*>& bottom, CUDNN_CHECK(cudnnCreate(&handle_)); cudnn::createTensor4dDesc(&bottom_desc_); cudnn::createTensor4dDesc(&top_desc_); + 
cudnn::createActivationDescriptor(&activ_desc_, CUDNN_ACTIVATION_RELU); handles_setup_ = true; } @@ -35,6 +36,7 @@ CuDNNReLULayer::~CuDNNReLULayer() { cudnnDestroyTensorDescriptor(this->bottom_desc_); cudnnDestroyTensorDescriptor(this->top_desc_); + cudnnDestroyActivationDescriptor(this->activ_desc_); cudnnDestroy(this->handle_); } diff --git a/src/caffe/layers/cudnn_relu_layer.cu b/src/caffe/layers/cudnn_relu_layer.cu index 9f617183baa..e7928bbd6e0 100644 --- a/src/caffe/layers/cudnn_relu_layer.cu +++ b/src/caffe/layers/cudnn_relu_layer.cu @@ -15,12 +15,21 @@ void CuDNNReLULayer::Forward_gpu(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); +#if CUDNN_VERSION_MIN(5, 0, 0) CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_RELU, + activ_desc_, cudnn::dataType::one, this->bottom_desc_, bottom_data, cudnn::dataType::zero, this->top_desc_, top_data)); +#else + CUDNN_CHECK(cudnnActivationForward_v4(this->handle_, + activ_desc_, + cudnn::dataType::one, + this->bottom_desc_, bottom_data, + cudnn::dataType::zero, + this->top_desc_, top_data)); +#endif } template @@ -40,13 +49,23 @@ void CuDNNReLULayer::Backward_gpu(const vector*>& top, const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); +#if CUDNN_VERSION_MIN(5, 0, 0) CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_RELU, + activ_desc_, cudnn::dataType::one, this->top_desc_, top_data, this->top_desc_, top_diff, this->bottom_desc_, bottom_data, cudnn::dataType::zero, this->bottom_desc_, bottom_diff)); +#else + CUDNN_CHECK(cudnnActivationBackward_v4(this->handle_, + activ_desc_, + cudnn::dataType::one, + this->top_desc_, top_data, this->top_desc_, top_diff, + this->bottom_desc_, bottom_data, + cudnn::dataType::zero, + this->bottom_desc_, bottom_diff)); +#endif } INSTANTIATE_LAYER_GPU_FUNCS(CuDNNReLULayer); diff --git 
a/src/caffe/layers/cudnn_sigmoid_layer.cpp b/src/caffe/layers/cudnn_sigmoid_layer.cpp index ccb955cdaff..3ce6aef1764 100644 --- a/src/caffe/layers/cudnn_sigmoid_layer.cpp +++ b/src/caffe/layers/cudnn_sigmoid_layer.cpp @@ -13,6 +13,8 @@ void CuDNNSigmoidLayer::LayerSetUp(const vector*>& bottom, CUDNN_CHECK(cudnnCreate(&handle_)); cudnn::createTensor4dDesc(&bottom_desc_); cudnn::createTensor4dDesc(&top_desc_); + cudnn::createActivationDescriptor(&activ_desc_, + CUDNN_ACTIVATION_SIGMOID); handles_setup_ = true; } diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cu b/src/caffe/layers/cudnn_sigmoid_layer.cu index e2a4b460c6c..48d6cbab6de 100644 --- a/src/caffe/layers/cudnn_sigmoid_layer.cu +++ b/src/caffe/layers/cudnn_sigmoid_layer.cu @@ -10,12 +10,21 @@ void CuDNNSigmoidLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); +#if CUDNN_VERSION_MIN(5, 0, 0) CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_SIGMOID, + activ_desc_, cudnn::dataType::one, this->bottom_desc_, bottom_data, cudnn::dataType::zero, this->top_desc_, top_data)); +#else + CUDNN_CHECK(cudnnActivationForward_v4(this->handle_, + activ_desc_, + cudnn::dataType::one, + this->bottom_desc_, bottom_data, + cudnn::dataType::zero, + this->top_desc_, top_data)); +#endif } template @@ -30,13 +39,23 @@ void CuDNNSigmoidLayer::Backward_gpu(const vector*>& top, const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); +#if CUDNN_VERSION_MIN(5, 0, 0) CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_SIGMOID, + activ_desc_, cudnn::dataType::one, this->top_desc_, top_data, this->top_desc_, top_diff, this->bottom_desc_, bottom_data, cudnn::dataType::zero, this->bottom_desc_, bottom_diff)); +#else + CUDNN_CHECK(cudnnActivationBackward_v4(this->handle_, + activ_desc_, + 
cudnn::dataType::one, + this->top_desc_, top_data, this->top_desc_, top_diff, + this->bottom_desc_, bottom_data, + cudnn::dataType::zero, + this->bottom_desc_, bottom_diff)); +#endif } INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSigmoidLayer); diff --git a/src/caffe/layers/cudnn_tanh_layer.cpp b/src/caffe/layers/cudnn_tanh_layer.cpp index 1a56418227c..e87dd9de0ab 100644 --- a/src/caffe/layers/cudnn_tanh_layer.cpp +++ b/src/caffe/layers/cudnn_tanh_layer.cpp @@ -13,6 +13,7 @@ void CuDNNTanHLayer::LayerSetUp(const vector*>& bottom, CUDNN_CHECK(cudnnCreate(&handle_)); cudnn::createTensor4dDesc(&bottom_desc_); cudnn::createTensor4dDesc(&top_desc_); + cudnn::createActivationDescriptor(&activ_desc_, CUDNN_ACTIVATION_TANH); handles_setup_ = true; } diff --git a/src/caffe/layers/cudnn_tanh_layer.cu b/src/caffe/layers/cudnn_tanh_layer.cu index 89df28a3e8b..6b5d7ae7ea7 100644 --- a/src/caffe/layers/cudnn_tanh_layer.cu +++ b/src/caffe/layers/cudnn_tanh_layer.cu @@ -10,12 +10,21 @@ void CuDNNTanHLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); +#if CUDNN_VERSION_MIN(5, 0, 0) CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_TANH, + activ_desc_, cudnn::dataType::one, this->bottom_desc_, bottom_data, cudnn::dataType::zero, this->top_desc_, top_data)); +#else + CUDNN_CHECK(cudnnActivationForward_v4(this->handle_, + activ_desc_, + cudnn::dataType::one, + this->bottom_desc_, bottom_data, + cudnn::dataType::zero, + this->top_desc_, top_data)); +#endif } template @@ -31,13 +40,23 @@ void CuDNNTanHLayer::Backward_gpu(const vector*>& top, const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); +#if CUDNN_VERSION_MIN(5, 0, 0) CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_TANH, + activ_desc_, cudnn::dataType::one, this->top_desc_, top_data, this->top_desc_, top_diff, this->bottom_desc_, bottom_data, 
cudnn::dataType::zero, this->bottom_desc_, bottom_diff)); +#else + CUDNN_CHECK(cudnnActivationBackward_v4(this->handle_, + activ_desc_, + cudnn::dataType::one, + this->top_desc_, top_data, this->top_desc_, top_diff, + this->bottom_desc_, bottom_data, + cudnn::dataType::zero, + this->bottom_desc_, bottom_diff)); +#endif } INSTANTIATE_LAYER_GPU_FUNCS(CuDNNTanHLayer); diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 66e6301fd45..0f1296bbc77 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -14,7 +14,10 @@ namespace caffe { template DataLayer::DataLayer(const LayerParameter& param) : BasePrefetchingDataLayer(param), - reader_(param) { + offset_() { + db_.reset(db::GetDB(param.data_param().backend())); + db_->Open(param.data_param().source(), db::READ); + cursor_.reset(db_->NewCursor()); } template @@ -27,7 +30,8 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, const vector*>& top) { const int batch_size = this->layer_param_.data_param().batch_size(); // Read a data point, and use it to initialize the top blob. - Datum& datum = *(reader_.full().peek()); + Datum datum; + datum.ParseFromString(cursor_->value()); // Use data_transformer to infer the expected blob shape from datum. vector top_shape = this->data_transformer_->InferBlobShape(datum); @@ -35,22 +39,44 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, // Reshape top[0] and prefetch_data according to the batch_size. 
top_shape[0] = batch_size; top[0]->Reshape(top_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].data_.Reshape(top_shape); + for (int i = 0; i < this->prefetch_.size(); ++i) { + this->prefetch_[i]->data_.Reshape(top_shape); } - LOG(INFO) << "output data size: " << top[0]->num() << "," + LOG_IF(INFO, Caffe::root_solver()) + << "output data size: " << top[0]->num() << "," << top[0]->channels() << "," << top[0]->height() << "," << top[0]->width(); // label if (this->output_labels_) { vector label_shape(1, batch_size); top[1]->Reshape(label_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].label_.Reshape(label_shape); + for (int i = 0; i < this->prefetch_.size(); ++i) { + this->prefetch_[i]->label_.Reshape(label_shape); } } } +template +bool DataLayer::Skip() { + int size = Caffe::solver_count(); + int rank = Caffe::solver_rank(); + bool keep = (offset_ % size) == rank || + // In test mode, only rank 0 runs, so avoid skipping + this->layer_param_.phase() == TEST; + return !keep; +} + +template +void DataLayer::Next() { + cursor_->Next(); + if (!cursor_->valid()) { + LOG_IF(INFO, Caffe::root_solver()) + << "Restarting data prefetching from start."; + cursor_->SeekToFirst(); + } + offset_++; +} + // This function is called on prefetch thread template void DataLayer::load_batch(Batch* batch) { @@ -61,41 +87,41 @@ void DataLayer::load_batch(Batch* batch) { CPUTimer timer; CHECK(batch->data_.count()); CHECK(this->transformed_data_.count()); - - // Reshape according to the first datum of each batch - // on single input batches allows for inputs of varying dimension. const int batch_size = this->layer_param_.data_param().batch_size(); - Datum& datum = *(reader_.full().peek()); - // Use data_transformer to infer the expected blob shape from datum. - vector top_shape = this->data_transformer_->InferBlobShape(datum); - this->transformed_data_.Reshape(top_shape); - // Reshape batch according to the batch_size. 
- top_shape[0] = batch_size; - batch->data_.Reshape(top_shape); - - Dtype* top_data = batch->data_.mutable_cpu_data(); - Dtype* top_label = NULL; // suppress warnings about uninitialized variables - if (this->output_labels_) { - top_label = batch->label_.mutable_cpu_data(); - } + Datum datum; for (int item_id = 0; item_id < batch_size; ++item_id) { timer.Start(); - // get a datum - Datum& datum = *(reader_.full().pop("Waiting for data")); + while (Skip()) { + Next(); + } + datum.ParseFromString(cursor_->value()); read_time += timer.MicroSeconds(); - timer.Start(); + + if (item_id == 0) { + // Reshape according to the first datum of each batch + // on single input batches allows for inputs of varying dimension. + // Use data_transformer to infer the expected blob shape from datum. + vector top_shape = this->data_transformer_->InferBlobShape(datum); + this->transformed_data_.Reshape(top_shape); + // Reshape batch according to the batch_size. + top_shape[0] = batch_size; + batch->data_.Reshape(top_shape); + } + // Apply data transformations (mirror, scale, crop...) + timer.Start(); int offset = batch->data_.offset(item_id); + Dtype* top_data = batch->data_.mutable_cpu_data(); this->transformed_data_.set_cpu_data(top_data + offset); this->data_transformer_->Transform(datum, &(this->transformed_data_)); // Copy label. 
if (this->output_labels_) { + Dtype* top_label = batch->label_.mutable_cpu_data(); top_label[item_id] = datum.label(); } trans_time += timer.MicroSeconds(); - - reader_.free().push(const_cast(&datum)); + Next(); } timer.Stop(); batch_timer.Stop(); diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 21256166bfa..3d82b0e1cbf 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -31,7 +31,9 @@ template void EltwiseLayer::Reshape(const vector*>& bottom, const vector*>& top) { for (int i = 1; i < bottom.size(); ++i) { - CHECK(bottom[i]->shape() == bottom[0]->shape()); + CHECK(bottom[0]->shape() == bottom[i]->shape()) + << "bottom[0]: " << bottom[0]->shape_string() + << ", bottom[" << i << "]: " << bottom[i]->shape_string(); } top[0]->ReshapeLike(*bottom[0]); // If max operation, we will initialize the vector index part. diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp index 1f4a309fe25..0c1b463ae12 100644 --- a/src/caffe/layers/exp_layer.cpp +++ b/src/caffe/layers/exp_layer.cpp @@ -23,7 +23,8 @@ void ExpLayer::LayerSetUp(const vector*>& bottom, const Dtype input_scale = this->layer_param_.exp_param().scale(); const Dtype input_shift = this->layer_param_.exp_param().shift(); inner_scale_ = log_base * input_scale; - outer_scale_ = (input_shift == Dtype(0)) ? Dtype(1) : pow(base, input_shift); + outer_scale_ = (input_shift == Dtype(0)) ? Dtype(1) : + ( (base != Dtype(-1)) ? 
pow(base, input_shift) : exp(input_shift) ); } template diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 2f13dc641df..00716a92b15 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -39,8 +39,9 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { for (int i = 0; i < top_size; ++i) { hdf_blobs_[i] = shared_ptr >(new Blob()); + // Allow reshape here, as we are loading data not params hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(), - MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get()); + MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get(), true); } herr_t status = H5Fclose(file_id); @@ -61,10 +62,10 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { // Shuffle if needed. if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) + DLOG(INFO) << "Successfully loaded " << hdf_blobs_[0]->shape(0) << " rows (shuffled)"; } else { - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows"; + DLOG(INFO) << "Successfully loaded " << hdf_blobs_[0]->shape(0) << " rows"; } } @@ -124,28 +125,46 @@ void HDF5DataLayer::LayerSetUp(const vector*>& bottom, } } +template +bool HDF5DataLayer::Skip() { + int size = Caffe::solver_count(); + int rank = Caffe::solver_rank(); + bool keep = (offset_ % size) == rank || + // In test mode, only rank 0 runs, so avoid skipping + this->layer_param_.phase() == TEST; + return !keep; +} + +template +void HDF5DataLayer::Next() { + if (++current_row_ == hdf_blobs_[0]->shape(0)) { + if (num_files_ > 1) { + ++current_file_; + if (current_file_ == num_files_) { + current_file_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), + file_permutation_.end()); + } + DLOG(INFO) << "Looping around to first file."; + } + LoadHDF5FileData( 
+ hdf_filenames_[file_permutation_[current_file_]].c_str()); + } + current_row_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + } + offset_++; +} + template void HDF5DataLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->shape(0)) { - if (num_files_ > 1) { - ++current_file_; - if (current_file_ == num_files_) { - current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); - } - DLOG(INFO) << "Looping around to first file."; - } - LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); - } - current_row_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + for (int i = 0; i < batch_size; ++i) { + while (Skip()) { + Next(); } for (int j = 0; j < this->layer_param_.top_size(); ++j) { int data_dim = top[j]->count() / top[j]->shape(0); @@ -153,6 +172,7 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); } + Next(); } } diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu index 595d2230220..33eebd41dfc 100644 --- a/src/caffe/layers/hdf5_data_layer.cu +++ b/src/caffe/layers/hdf5_data_layer.cu @@ -17,24 +17,9 @@ template void HDF5DataLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->shape(0)) { - if (num_files_ > 1) { - current_file_ += 1; - if (current_file_ == num_files_) { - 
current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); - } - DLOG(INFO) << "Looping around to first file."; - } - LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); - } - current_row_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + for (int i = 0; i < batch_size; ++i) { + while (Skip()) { + Next(); } for (int j = 0; j < this->layer_param_.top_size(); ++j) { int data_dim = top[j]->count() / top[j]->shape(0); @@ -42,6 +27,7 @@ void HDF5DataLayer::Forward_gpu(const vector*>& bottom, &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]); } + Next(); } } diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 62fda4accce..ec0fc5b0383 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -37,18 +37,28 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, const string& source = this->layer_param_.image_data_param().source(); LOG(INFO) << "Opening file " << source; std::ifstream infile(source.c_str()); - string filename; + string line; + size_t pos; int label; - while (infile >> filename >> label) { - lines_.push_back(std::make_pair(filename, label)); + while (std::getline(infile, line)) { + pos = line.find_last_of(' '); + label = atoi(line.substr(pos + 1).c_str()); + lines_.push_back(std::make_pair(line.substr(0, pos), label)); } + CHECK(!lines_.empty()) << "File is empty"; + if (this->layer_param_.image_data_param().shuffle()) { // randomly shuffle data LOG(INFO) << "Shuffling data"; const unsigned int prefetch_rng_seed = caffe_rng_rand(); prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); ShuffleImages(); + } else { + if (this->phase_ == TRAIN && Caffe::solver_rank() > 0 && + 
this->layer_param_.image_data_param().rand_skip() == 0) { + LOG(WARNING) << "Shuffling or skipping recommended for multi-GPU"; + } } LOG(INFO) << "A total of " << lines_.size() << " images."; @@ -72,8 +82,8 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, const int batch_size = this->layer_param_.image_data_param().batch_size(); CHECK_GT(batch_size, 0) << "Positive batch size required"; top_shape[0] = batch_size; - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].data_.Reshape(top_shape); + for (int i = 0; i < this->prefetch_.size(); ++i) { + this->prefetch_[i]->data_.Reshape(top_shape); } top[0]->Reshape(top_shape); @@ -83,8 +93,8 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, // label vector label_shape(1, batch_size); top[1]->Reshape(label_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].label_.Reshape(label_shape); + for (int i = 0; i < this->prefetch_.size(); ++i) { + this->prefetch_[i]->label_.Reshape(label_shape); } } diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp index 624d3118124..3c3f460ec34 100644 --- a/src/caffe/layers/infogain_loss_layer.cpp +++ b/src/caffe/layers/infogain_loss_layer.cpp @@ -3,7 +3,8 @@ #include #include "caffe/layers/infogain_loss_layer.hpp" -#include "caffe/util/io.hpp" +#include "caffe/util/io.hpp" // for blob reading of matrix H +#include "caffe/util/math_functions.hpp" namespace caffe { @@ -11,6 +12,31 @@ template void InfogainLossLayer::LayerSetUp( const vector*>& bottom, const vector*>& top) { LossLayer::LayerSetUp(bottom, top); + // internal softmax layer + LayerParameter softmax_layer_param(this->layer_param_); + SoftmaxParameter* softmax_param = softmax_layer_param.mutable_softmax_param(); + softmax_param->set_axis(this->layer_param_.infogain_loss_param().axis()); + softmax_layer_param.set_type("Softmax"); + softmax_layer_param.clear_loss_weight(); + softmax_layer_param.add_loss_weight(1); + 
softmax_layer_ = LayerRegistry::CreateLayer(softmax_layer_param); + softmax_bottom_vec_.clear(); + softmax_bottom_vec_.push_back(bottom[0]); + softmax_top_vec_.clear(); + softmax_top_vec_.push_back(&prob_); + softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_); + + // ignore label + has_ignore_label_ = + this->layer_param_.loss_param().has_ignore_label(); + if (has_ignore_label_) { + ignore_label_ = this->layer_param_.loss_param().ignore_label(); + } + // normalization + CHECK(!this->layer_param_.loss_param().has_normalize()) + << "normalize is deprecated. use \"normalization\""; + normalization_ = this->layer_param_.loss_param().normalization(); + // matrix H if (bottom.size() < 3) { CHECK(this->layer_param_.infogain_loss_param().has_source()) << "Infogain matrix source must be specified."; @@ -25,28 +51,86 @@ template void InfogainLossLayer::Reshape( const vector*>& bottom, const vector*>& top) { LossLayer::Reshape(bottom, top); + softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_); + infogain_axis_ = + bottom[0]->CanonicalAxisIndex( + this->layer_param_.infogain_loss_param().axis()); + outer_num_ = bottom[0]->count(0, infogain_axis_); + inner_num_ = bottom[0]->count(infogain_axis_ + 1); + CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) + << "Number of labels must match number of predictions; " + << "e.g., if infogain axis == 1 and prediction shape is (N, C, H, W), " + << "label count (number of labels) must be N*H*W, " + << "with integer values in {0, 1, ..., C-1}."; + num_labels_ = bottom[0]->shape(infogain_axis_); Blob* infogain = NULL; if (bottom.size() < 3) { infogain = &infogain_; } else { infogain = bottom[2]; } - CHECK_EQ(bottom[1]->channels(), 1); - CHECK_EQ(bottom[1]->height(), 1); - CHECK_EQ(bottom[1]->width(), 1); - const int num = bottom[0]->num(); - const int dim = bottom[0]->count() / num; - CHECK_EQ(infogain->num(), 1); - CHECK_EQ(infogain->channels(), 1); - CHECK_EQ(infogain->height(), dim); - CHECK_EQ(infogain->width(), 
dim); + CHECK_EQ(infogain->count(), num_labels_*num_labels_); + sum_rows_H_.Reshape(vector(1, num_labels_)); + if (bottom.size() == 2) { + // H is provided as a parameter and will not change. sum rows once + sum_rows_of_H(infogain); + } + if (top.size() >= 2) { + // softmax output + top[1]->ReshapeLike(*bottom[0]); + } +} + +template +Dtype InfogainLossLayer::get_normalizer( + LossParameter_NormalizationMode normalization_mode, int valid_count) { + Dtype normalizer; + switch (normalization_mode) { + case LossParameter_NormalizationMode_FULL: + normalizer = Dtype(outer_num_ * inner_num_); + break; + case LossParameter_NormalizationMode_VALID: + if (valid_count == -1) { + normalizer = Dtype(outer_num_ * inner_num_); + } else { + normalizer = Dtype(valid_count); + } + break; + case LossParameter_NormalizationMode_BATCH_SIZE: + normalizer = Dtype(outer_num_); + break; + case LossParameter_NormalizationMode_NONE: + normalizer = Dtype(1); + break; + default: + LOG(FATAL) << "Unknown normalization mode: " + << LossParameter_NormalizationMode_Name(normalization_mode); + } + // Some users will have no labels for some examples in order to 'turn off' a + // particular loss in a multi-task setup. The max prevents NaNs in that case. + return std::max(Dtype(1.0), normalizer); } +template +void InfogainLossLayer::sum_rows_of_H(const Blob* H) { + CHECK_EQ(H->count(), num_labels_*num_labels_) + << "H must be " << num_labels_ << "x" << num_labels_; + const Dtype* infogain_mat = H->cpu_data(); + Dtype* sum = sum_rows_H_.mutable_cpu_data(); + for ( int row = 0; row < num_labels_ ; row++ ) { + sum[row] = 0; + for ( int col = 0; col < num_labels_ ; col++ ) { + sum[row] += infogain_mat[row*num_labels_+col]; + } + } +} template void InfogainLossLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); + // The forward pass computes the softmax prob values. 
+ softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); + const Dtype* prob_data = prob_.cpu_data(); const Dtype* bottom_label = bottom[1]->cpu_data(); const Dtype* infogain_mat = NULL; if (bottom.size() < 3) { @@ -54,17 +138,30 @@ void InfogainLossLayer::Forward_cpu(const vector*>& bottom, } else { infogain_mat = bottom[2]->cpu_data(); } - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); + int count = 0; Dtype loss = 0; - for (int i = 0; i < num; ++i) { - int label = static_cast(bottom_label[i]); - for (int j = 0; j < dim; ++j) { - Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); - loss -= infogain_mat[label * dim + j] * log(prob); + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; j++) { + const int label_value = + static_cast(bottom_label[i * inner_num_ + j]); + if (has_ignore_label_ && label_value == ignore_label_) { + continue; + } + DCHECK_GE(label_value, 0); + DCHECK_LT(label_value, num_labels_); + for (int l = 0; l < num_labels_; l++) { + loss -= infogain_mat[label_value * num_labels_ + l] * + log(std::max( + prob_data[i * inner_num_*num_labels_ + l * inner_num_ + j], + Dtype(kLOG_THRESHOLD))); + } + ++count; } } - top[0]->mutable_cpu_data()[0] = loss / num; + top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count); + if (top.size() == 2) { + top[1]->ShareData(prob_); + } } template @@ -80,25 +177,44 @@ void InfogainLossLayer::Backward_cpu(const vector*>& top, << " Layer cannot backpropagate to infogain inputs."; } if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* prob_data = prob_.cpu_data(); const Dtype* bottom_label = bottom[1]->cpu_data(); const Dtype* infogain_mat = NULL; if (bottom.size() < 3) { infogain_mat = infogain_.cpu_data(); } else { infogain_mat = bottom[2]->cpu_data(); + // H is provided as a "bottom" and might change. sum rows every time. 
+ sum_rows_of_H(bottom[2]); } + const Dtype* sum_rows_H = sum_rows_H_.cpu_data(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); - const Dtype scale = - top[0]->cpu_diff()[0] / num; - for (int i = 0; i < num; ++i) { - const int label = static_cast(bottom_label[i]); - for (int j = 0; j < dim; ++j) { - Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); - bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob; + const int dim = bottom[0]->count() / outer_num_; + int count = 0; + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; ++j) { + const int label_value = + static_cast(bottom_label[i * inner_num_ + j]); + DCHECK_GE(label_value, 0); + DCHECK_LT(label_value, num_labels_); + if (has_ignore_label_ && label_value == ignore_label_) { + for (int l = 0; l < num_labels_; ++l) { + bottom_diff[i * dim + l * inner_num_ + j] = 0; + } + } else { + for (int l = 0; l < num_labels_; ++l) { + bottom_diff[i * dim + l * inner_num_ + j] = + prob_data[i*dim + l*inner_num_ + j]*sum_rows_H[label_value] + - infogain_mat[label_value * num_labels_ + l]; + } + ++count; + } } } + // Scale gradient + Dtype loss_weight = top[0]->cpu_diff()[0] / + get_normalizer(normalization_, count); + caffe_scal(bottom[0]->count(), loss_weight, bottom_diff); } } diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp index c0b7a862181..afb1ce94893 100644 --- a/src/caffe/layers/loss_layer.cpp +++ b/src/caffe/layers/loss_layer.cpp @@ -16,8 +16,8 @@ void LossLayer::LayerSetUp( template void LossLayer::Reshape( const vector*>& bottom, const vector*>& top) { - CHECK_EQ(bottom[0]->num(), bottom[1]->num()) - << "The data and label should have the same number."; + CHECK_EQ(bottom[0]->shape(0), bottom[1]->shape(0)) + << "The data and label should have the same first dimension."; vector loss_shape(0); // Loss layers output a scalar; 0 axes. 
top[0]->Reshape(loss_shape); } diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index 210525e20f3..f7520f1408f 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -14,6 +14,7 @@ void LRNLayer::LayerSetUp(const vector*>& bottom, alpha_ = this->layer_param_.lrn_param().alpha(); beta_ = this->layer_param_.lrn_param().beta(); k_ = this->layer_param_.lrn_param().k(); + deconv_ignore_ = this->layer_param_.lrn_param().deconv_ignore(); if (this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL) { // Set up split_layer_ to use inputs in the numerator and denominator. @@ -246,10 +247,36 @@ void LRNLayer::WithinChannelBackward( } } +template +void LRNLayer::Deconv_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, int deconv_type) { + if (deconv_ignore_) { + // Deconv Option 1: pass through (ignore LRN layer): + Deconv_passthrough_cpu(top, propagate_down, bottom, deconv_type); + } else { + // Deconv Option 2: compute derivatives via backprop: + Backward_cpu(top, propagate_down, bottom); + } +} + +template +void LRNLayer::Deconv_passthrough_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, int deconv_type) { + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + bottom_diff[i] = top_diff[i]; + } + } +} + #ifdef CPU_ONLY -STUB_GPU(LRNLayer); +STUB_GPU_WITH_DECONV(LRNLayer); STUB_GPU_FORWARD(LRNLayer, CrossChannelForward); STUB_GPU_BACKWARD(LRNLayer, CrossChannelBackward); +STUB_GPU_DECONV(LRNLayer, Deconv_passthrough); #endif INSTANTIATE_CLASS(LRNLayer); diff --git a/src/caffe/layers/lrn_layer.cu b/src/caffe/layers/lrn_layer.cu index 26e619c7569..614af2e23db 100644 --- a/src/caffe/layers/lrn_layer.cu +++ b/src/caffe/layers/lrn_layer.cu @@ -195,8 +195,50 @@ template void 
LRNLayer::CrossChannelBackward_gpu( const vector*>& top, const vector& propagate_down, const vector*>& bottom); +template +void LRNLayer::Deconv_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, int deconv_type) { + if (deconv_ignore_) { + // Deconv Option 1: pass through (ignore LRN layer): + Deconv_passthrough_gpu(top, propagate_down, bottom, deconv_type); + } else { + // Deconv Option 2: compute derivatives via backprop: + Backward_gpu(top, propagate_down, bottom); + } +} + +template +__global__ void LRNDeconv_passthrough(const int n, const Dtype* in_diff, + Dtype* out_diff) { + CUDA_KERNEL_LOOP(index, n) { + out_diff[index] = in_diff[index]; + } +} + +template +void LRNLayer::Deconv_passthrough_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, int deconv_type) { + // Option 2: pass through (ignore LRN layer) + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LRNDeconv_passthrough<<>>( + count, top_diff, bottom_diff); + CUDA_POST_KERNEL_CHECK; + } +} +template void LRNLayer::Deconv_passthrough_gpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom, int deconv_type); +template void LRNLayer::Deconv_passthrough_gpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom, int deconv_type); + -INSTANTIATE_LAYER_GPU_FUNCS(LRNLayer); +//INSTANTIATE_LAYER_GPU_FUNCS(LRNLayer); +INSTANTIATE_LAYER_GPU_FUNCS_WITH_DECONV(LRNLayer); } // namespace caffe diff --git a/src/caffe/layers/lstm_layer.cpp b/src/caffe/layers/lstm_layer.cpp new file mode 100644 index 00000000000..da48dba4c05 --- /dev/null +++ b/src/caffe/layers/lstm_layer.cpp @@ -0,0 +1,244 @@ +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include 
"caffe/layers/lstm_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void LSTMLayer::RecurrentInputBlobNames(vector* names) const { + names->resize(2); + (*names)[0] = "h_0"; + (*names)[1] = "c_0"; +} + +template +void LSTMLayer::RecurrentOutputBlobNames(vector* names) const { + names->resize(2); + (*names)[0] = "h_" + format_int(this->T_); + (*names)[1] = "c_T"; +} + +template +void LSTMLayer::RecurrentInputShapes(vector* shapes) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + const int num_blobs = 2; + shapes->resize(num_blobs); + for (int i = 0; i < num_blobs; ++i) { + (*shapes)[i].Clear(); + (*shapes)[i].add_dim(1); // a single timestep + (*shapes)[i].add_dim(this->N_); + (*shapes)[i].add_dim(num_output); + } +} + +template +void LSTMLayer::OutputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "h"; +} + +template +void LSTMLayer::FillUnrolledNet(NetParameter* net_param) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + CHECK_GT(num_output, 0) << "num_output must be positive"; + const FillerParameter& weight_filler = + this->layer_param_.recurrent_param().weight_filler(); + const FillerParameter& bias_filler = + this->layer_param_.recurrent_param().bias_filler(); + + // Add generic LayerParameter's (without bottoms/tops) of layer types we'll + // use to save redundant code. 
+ LayerParameter hidden_param; + hidden_param.set_type("InnerProduct"); + hidden_param.mutable_inner_product_param()->set_num_output(num_output * 4); + hidden_param.mutable_inner_product_param()->set_bias_term(false); + hidden_param.mutable_inner_product_param()->set_axis(2); + hidden_param.mutable_inner_product_param()-> + mutable_weight_filler()->CopyFrom(weight_filler); + + LayerParameter biased_hidden_param(hidden_param); + biased_hidden_param.mutable_inner_product_param()->set_bias_term(true); + biased_hidden_param.mutable_inner_product_param()-> + mutable_bias_filler()->CopyFrom(bias_filler); + + LayerParameter sum_param; + sum_param.set_type("Eltwise"); + sum_param.mutable_eltwise_param()->set_operation( + EltwiseParameter_EltwiseOp_SUM); + + LayerParameter scale_param; + scale_param.set_type("Scale"); + scale_param.mutable_scale_param()->set_axis(0); + + LayerParameter slice_param; + slice_param.set_type("Slice"); + slice_param.mutable_slice_param()->set_axis(0); + + LayerParameter split_param; + split_param.set_type("Split"); + + vector input_shapes; + RecurrentInputShapes(&input_shapes); + CHECK_EQ(2, input_shapes.size()); + + LayerParameter* input_layer_param = net_param->add_layer(); + input_layer_param->set_type("Input"); + InputParameter* input_param = input_layer_param->mutable_input_param(); + + input_layer_param->add_top("c_0"); + input_param->add_shape()->CopyFrom(input_shapes[0]); + + input_layer_param->add_top("h_0"); + input_param->add_shape()->CopyFrom(input_shapes[1]); + + LayerParameter* cont_slice_param = net_param->add_layer(); + cont_slice_param->CopyFrom(slice_param); + cont_slice_param->set_name("cont_slice"); + cont_slice_param->add_bottom("cont"); + cont_slice_param->mutable_slice_param()->set_axis(0); + + // Add layer to transform all timesteps of x to the hidden state dimension. 
+ // W_xc_x = W_xc * x + b_c + { + LayerParameter* x_transform_param = net_param->add_layer(); + x_transform_param->CopyFrom(biased_hidden_param); + x_transform_param->set_name("x_transform"); + x_transform_param->add_param()->set_name("W_xc"); + x_transform_param->add_param()->set_name("b_c"); + x_transform_param->add_bottom("x"); + x_transform_param->add_top("W_xc_x"); + x_transform_param->add_propagate_down(true); + } + + if (this->static_input_) { + // Add layer to transform x_static to the gate dimension. + // W_xc_x_static = W_xc_static * x_static + LayerParameter* x_static_transform_param = net_param->add_layer(); + x_static_transform_param->CopyFrom(hidden_param); + x_static_transform_param->mutable_inner_product_param()->set_axis(1); + x_static_transform_param->set_name("W_xc_x_static"); + x_static_transform_param->add_param()->set_name("W_xc_static"); + x_static_transform_param->add_bottom("x_static"); + x_static_transform_param->add_top("W_xc_x_static_preshape"); + x_static_transform_param->add_propagate_down(true); + + LayerParameter* reshape_param = net_param->add_layer(); + reshape_param->set_type("Reshape"); + BlobShape* new_shape = + reshape_param->mutable_reshape_param()->mutable_shape(); + new_shape->add_dim(1); // One timestep. + // Should infer this->N as the dimension so we can reshape on batch size. 
+ new_shape->add_dim(-1); + new_shape->add_dim( + x_static_transform_param->inner_product_param().num_output()); + reshape_param->set_name("W_xc_x_static_reshape"); + reshape_param->add_bottom("W_xc_x_static_preshape"); + reshape_param->add_top("W_xc_x_static"); + } + + LayerParameter* x_slice_param = net_param->add_layer(); + x_slice_param->CopyFrom(slice_param); + x_slice_param->add_bottom("W_xc_x"); + x_slice_param->set_name("W_xc_x_slice"); + + LayerParameter output_concat_layer; + output_concat_layer.set_name("h_concat"); + output_concat_layer.set_type("Concat"); + output_concat_layer.add_top("h"); + output_concat_layer.mutable_concat_param()->set_axis(0); + + for (int t = 1; t <= this->T_; ++t) { + string tm1s = format_int(t - 1); + string ts = format_int(t); + + cont_slice_param->add_top("cont_" + ts); + x_slice_param->add_top("W_xc_x_" + ts); + + // Add layers to flush the hidden state when beginning a new + // sequence, as indicated by cont_t. + // h_conted_{t-1} := cont_t * h_{t-1} + // + // Normally, cont_t is binary (i.e., 0 or 1), so: + // h_conted_{t-1} := h_{t-1} if cont_t == 1 + // 0 otherwise + { + LayerParameter* cont_h_param = net_param->add_layer(); + cont_h_param->CopyFrom(scale_param); + cont_h_param->set_name("h_conted_" + tm1s); + cont_h_param->add_bottom("h_" + tm1s); + cont_h_param->add_bottom("cont_" + ts); + cont_h_param->add_top("h_conted_" + tm1s); + } + + // Add layer to compute + // W_hc_h_{t-1} := W_hc * h_conted_{t-1} + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(hidden_param); + w_param->set_name("transform_" + ts); + w_param->add_param()->set_name("W_hc"); + w_param->add_bottom("h_conted_" + tm1s); + w_param->add_top("W_hc_h_" + tm1s); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add the outputs of the linear transformations to compute the gate input. 
+ // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c + // = W_hc_h_{t-1} + W_xc_x_t + b_c + { + LayerParameter* input_sum_layer = net_param->add_layer(); + input_sum_layer->CopyFrom(sum_param); + input_sum_layer->set_name("gate_input_" + ts); + input_sum_layer->add_bottom("W_hc_h_" + tm1s); + input_sum_layer->add_bottom("W_xc_x_" + ts); + if (this->static_input_) { + input_sum_layer->add_bottom("W_xc_x_static"); + } + input_sum_layer->add_top("gate_input_" + ts); + } + + // Add LSTMUnit layer to compute the cell & hidden vectors c_t and h_t. + // Inputs: c_{t-1}, gate_input_t = (i_t, f_t, o_t, g_t), cont_t + // Outputs: c_t, h_t + // [ i_t' ] + // [ f_t' ] := gate_input_t + // [ o_t' ] + // [ g_t' ] + // i_t := \sigmoid[i_t'] + // f_t := \sigmoid[f_t'] + // o_t := \sigmoid[o_t'] + // g_t := \tanh[g_t'] + // c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) + // h_t := o_t .* \tanh[c_t] + { + LayerParameter* lstm_unit_param = net_param->add_layer(); + lstm_unit_param->set_type("LSTMUnit"); + lstm_unit_param->add_bottom("c_" + tm1s); + lstm_unit_param->add_bottom("gate_input_" + ts); + lstm_unit_param->add_bottom("cont_" + ts); + lstm_unit_param->add_top("c_" + ts); + lstm_unit_param->add_top("h_" + ts); + lstm_unit_param->set_name("unit_" + ts); + } + output_concat_layer.add_bottom("h_" + ts); + } // for (int t = 1; t <= this->T_; ++t) + + { + LayerParameter* c_T_copy_param = net_param->add_layer(); + c_T_copy_param->CopyFrom(split_param); + c_T_copy_param->add_bottom("c_" + format_int(this->T_)); + c_T_copy_param->add_top("c_T"); + } + net_param->add_layer()->CopyFrom(output_concat_layer); +} + +INSTANTIATE_CLASS(LSTMLayer); +REGISTER_LAYER_CLASS(LSTM); + +} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cpp b/src/caffe/layers/lstm_unit_layer.cpp new file mode 100644 index 00000000000..d1ab59c4bd1 --- /dev/null +++ b/src/caffe/layers/lstm_unit_layer.cpp @@ -0,0 +1,130 @@ +#include +#include +#include + +#include "caffe/layer.hpp" +#include 
"caffe/layers/lstm_layer.hpp" + +namespace caffe { + +template +inline Dtype sigmoid(Dtype x) { + return 1. / (1. + exp(-x)); +} + +template +inline Dtype tanh(Dtype x) { + return 2. * sigmoid(2. * x) - 1.; +} + +template +void LSTMUnitLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + const int num_instances = bottom[0]->shape(1); + for (int i = 0; i < bottom.size(); ++i) { + if (i == 2) { + CHECK_EQ(2, bottom[i]->num_axes()); + } else { + CHECK_EQ(3, bottom[i]->num_axes()); + } + CHECK_EQ(1, bottom[i]->shape(0)); + CHECK_EQ(num_instances, bottom[i]->shape(1)); + } + hidden_dim_ = bottom[0]->shape(2); + CHECK_EQ(4 * hidden_dim_, bottom[1]->shape(2)); + top[0]->ReshapeLike(*bottom[0]); + top[1]->ReshapeLike(*bottom[0]); + X_acts_.ReshapeLike(*bottom[1]); +} + +template +void LSTMUnitLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const int num = bottom[0]->shape(1); + const int x_dim = hidden_dim_ * 4; + const Dtype* C_prev = bottom[0]->cpu_data(); + const Dtype* X = bottom[1]->cpu_data(); + const Dtype* cont = bottom[2]->cpu_data(); + Dtype* C = top[0]->mutable_cpu_data(); + Dtype* H = top[1]->mutable_cpu_data(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < hidden_dim_; ++d) { + const Dtype i = sigmoid(X[d]); + const Dtype f = (*cont == 0) ? 
0 : + (*cont * sigmoid(X[1 * hidden_dim_ + d])); + const Dtype o = sigmoid(X[2 * hidden_dim_ + d]); + const Dtype g = tanh(X[3 * hidden_dim_ + d]); + const Dtype c_prev = C_prev[d]; + const Dtype c = f * c_prev + i * g; + C[d] = c; + const Dtype tanh_c = tanh(c); + H[d] = o * tanh_c; + } + C_prev += hidden_dim_; + X += x_dim; + C += hidden_dim_; + H += hidden_dim_; + ++cont; + } +} + +template +void LSTMUnitLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[1]) { return; } + + const int num = bottom[0]->shape(1); + const int x_dim = hidden_dim_ * 4; + const Dtype* C_prev = bottom[0]->cpu_data(); + const Dtype* X = bottom[1]->cpu_data(); + const Dtype* cont = bottom[2]->cpu_data(); + const Dtype* C = top[0]->cpu_data(); + const Dtype* H = top[1]->cpu_data(); + const Dtype* C_diff = top[0]->cpu_diff(); + const Dtype* H_diff = top[1]->cpu_diff(); + Dtype* C_prev_diff = bottom[0]->mutable_cpu_diff(); + Dtype* X_diff = bottom[1]->mutable_cpu_diff(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < hidden_dim_; ++d) { + const Dtype i = sigmoid(X[d]); + const Dtype f = (*cont == 0) ? 
0 : + (*cont * sigmoid(X[1 * hidden_dim_ + d])); + const Dtype o = sigmoid(X[2 * hidden_dim_ + d]); + const Dtype g = tanh(X[3 * hidden_dim_ + d]); + const Dtype c_prev = C_prev[d]; + const Dtype c = C[d]; + const Dtype tanh_c = tanh(c); + Dtype* c_prev_diff = C_prev_diff + d; + Dtype* i_diff = X_diff + d; + Dtype* f_diff = X_diff + 1 * hidden_dim_ + d; + Dtype* o_diff = X_diff + 2 * hidden_dim_ + d; + Dtype* g_diff = X_diff + 3 * hidden_dim_ + d; + const Dtype c_term_diff = + C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c); + *c_prev_diff = c_term_diff * f; + *i_diff = c_term_diff * g * i * (1 - i); + *f_diff = c_term_diff * c_prev * f * (1 - f); + *o_diff = H_diff[d] * tanh_c * o * (1 - o); + *g_diff = c_term_diff * i * (1 - g * g); + } + C_prev += hidden_dim_; + X += x_dim; + C += hidden_dim_; + H += hidden_dim_; + C_diff += hidden_dim_; + H_diff += hidden_dim_; + X_diff += x_dim; + C_prev_diff += hidden_dim_; + ++cont; + } +} + +#ifdef CPU_ONLY +STUB_GPU(LSTMUnitLayer); +#endif + +INSTANTIATE_CLASS(LSTMUnitLayer); +REGISTER_LAYER_CLASS(LSTMUnit); + +} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cu b/src/caffe/layers/lstm_unit_layer.cu new file mode 100644 index 00000000000..15bb451d9e0 --- /dev/null +++ b/src/caffe/layers/lstm_unit_layer.cu @@ -0,0 +1,154 @@ +#include +#include +#include + +#include "caffe/layer.hpp" +#include "caffe/layers/lstm_layer.hpp" + +namespace caffe { + +template +__device__ Dtype sigmoid(const Dtype x) { + return Dtype(1) / (Dtype(1) + exp(-x)); +} + +template +__device__ Dtype tanh(const Dtype x) { + return Dtype(2) * sigmoid(Dtype(2) * x) - Dtype(1); +} + +template +__global__ void LSTMActsForward(const int nthreads, const int dim, + const Dtype* X, Dtype* X_acts) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int x_dim = 4 * dim; + const int d = index % x_dim; + if (d < 3 * dim) { + X_acts[index] = sigmoid(X[index]); + } else { + X_acts[index] = tanh(X[index]); + } + } +} + +template +__global__ void 
LSTMUnitForward(const int nthreads, const int dim, + const Dtype* C_prev, const Dtype* X, const Dtype* cont, + Dtype* C, Dtype* H) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const Dtype* X_offset = X + 4 * dim * n; + const Dtype i = X_offset[d]; + const Dtype f = X_offset[1 * dim + d]; + const Dtype o = X_offset[2 * dim + d]; + const Dtype g = X_offset[3 * dim + d]; + const Dtype c_prev = C_prev[index]; + const Dtype c = cont[n] * f * c_prev + i * g; + C[index] = c; + const Dtype tanh_c = tanh(c); + H[index] = o * tanh_c; + } +} + +template +void LSTMUnitLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const int count = top[1]->count(); + const Dtype* C_prev = bottom[0]->gpu_data(); + const Dtype* X = bottom[1]->gpu_data(); + const Dtype* cont = bottom[2]->gpu_data(); + Dtype* X_acts = X_acts_.mutable_gpu_data(); + Dtype* C = top[0]->mutable_gpu_data(); + Dtype* H = top[1]->mutable_gpu_data(); + const int X_count = bottom[1]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LSTMActsForward<<>>( + X_count, hidden_dim_, X, X_acts); + CUDA_POST_KERNEL_CHECK; + // NOLINT_NEXT_LINE(whitespace/operators) + LSTMUnitForward<<>>( + count, hidden_dim_, C_prev, X_acts, cont, C, H); + CUDA_POST_KERNEL_CHECK; +} + +template +__global__ void LSTMUnitBackward(const int nthreads, const int dim, + const Dtype* C_prev, const Dtype* X, const Dtype* C, const Dtype* H, + const Dtype* cont, const Dtype* C_diff, const Dtype* H_diff, + Dtype* C_prev_diff, Dtype* X_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const Dtype* X_offset = X + 4 * dim * n; + const Dtype i = X_offset[d]; + const Dtype f = X_offset[1 * dim + d]; + const Dtype o = X_offset[2 * dim + d]; + const Dtype g = X_offset[3 * dim + d]; + const Dtype c_prev = C_prev[index]; + const Dtype c = C[index]; + const Dtype tanh_c = tanh(c); + Dtype* c_prev_diff = C_prev_diff + index; + 
Dtype* X_diff_offset = X_diff + 4 * dim * n; + Dtype* i_diff = X_diff_offset + d; + Dtype* f_diff = X_diff_offset + 1 * dim + d; + Dtype* o_diff = X_diff_offset + 2 * dim + d; + Dtype* g_diff = X_diff_offset + 3 * dim + d; + const Dtype c_term_diff = + C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); + const Dtype cont_n = cont[n]; + *c_prev_diff = cont_n * c_term_diff * f; + *i_diff = c_term_diff * g; + *f_diff = cont_n * c_term_diff * c_prev; + *o_diff = H_diff[index] * tanh_c; + *g_diff = c_term_diff * i; + } +} + +template +__global__ void LSTMActsBackward(const int nthreads, const int dim, + const Dtype* X_acts, const Dtype* X_acts_diff, Dtype* X_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int x_dim = 4 * dim; + const int d = index % x_dim; + const Dtype X_act = X_acts[index]; + if (d < 3 * dim) { + X_diff[index] = X_acts_diff[index] * X_act * (Dtype(1) - X_act); + } else { + X_diff[index] = X_acts_diff[index] * (Dtype(1) - X_act * X_act); + } + } +} + +template +void LSTMUnitLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[1]) { return; } + + const int count = top[1]->count(); + const Dtype* C_prev = bottom[0]->gpu_data(); + const Dtype* X_acts = X_acts_.gpu_data(); + const Dtype* cont = bottom[2]->gpu_data(); + const Dtype* C = top[0]->gpu_data(); + const Dtype* H = top[1]->gpu_data(); + const Dtype* C_diff = top[0]->gpu_diff(); + const Dtype* H_diff = top[1]->gpu_diff(); + Dtype* C_prev_diff = bottom[0]->mutable_gpu_diff(); + Dtype* X_acts_diff = X_acts_.mutable_gpu_diff(); + LSTMUnitBackward // NOLINT_NEXT_LINE(whitespace/operators) + <<>>(count, hidden_dim_, + C_prev, X_acts, C, H, cont, C_diff, H_diff, C_prev_diff, X_acts_diff); + CUDA_POST_KERNEL_CHECK; + const int X_count = bottom[1]->count(); + Dtype* X_diff = bottom[1]->mutable_gpu_diff(); + LSTMActsBackward 
// NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + X_count, hidden_dim_, X_acts, X_acts_diff, X_diff); + CUDA_POST_KERNEL_CHECK; +} + +INSTANTIATE_LAYER_GPU_FUNCS(LSTMUnitLayer); + +} // namespace caffe diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp index 82909874054..975f4841723 100644 --- a/src/caffe/layers/memory_data_layer.cpp +++ b/src/caffe/layers/memory_data_layer.cpp @@ -107,7 +107,7 @@ void MemoryDataLayer::set_batch_size(int new_size) { template void MemoryDataLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { - CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset"; + CHECK(data_) << "MemoryDataLayer needs to be initialized by calling Reset"; top[0]->Reshape(batch_size_, channels_, height_, width_); top[1]->Reshape(batch_size_, 1, 1, 1); top[0]->set_cpu_data(data_ + pos_ * size_); diff --git a/src/caffe/layers/parameter_layer.cpp b/src/caffe/layers/parameter_layer.cpp new file mode 100644 index 00000000000..fbd326f8469 --- /dev/null +++ b/src/caffe/layers/parameter_layer.cpp @@ -0,0 +1,8 @@ +#include "caffe/layers/parameter_layer.hpp" + +namespace caffe { + +INSTANTIATE_CLASS(ParameterLayer); +REGISTER_LAYER_CLASS(Parameter); + +} // namespace caffe diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu index 1ea46cc81b1..46eddb94924 100644 --- a/src/caffe/layers/pooling_layer.cu +++ b/src/caffe/layers/pooling_layer.cu @@ -138,7 +138,7 @@ __global__ void StoPoolForwardTest(const int nthreads, const int wstart = pw * stride_w; const int wend = min(wstart + kernel_w, width); // We set cumsum to be 0 to avoid divide-by-zero problems - Dtype cumsum = FLT_MIN; + Dtype cumsum = 0.; Dtype cumvalues = 0.; const Dtype* const bottom_slice = bottom_data + (n * channels + c) * height * width; @@ -149,7 +149,7 @@ __global__ void StoPoolForwardTest(const int nthreads, cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; } } - top_data[index] 
= cumvalues / cumsum; + top_data[index] = (cumsum > 0.) ? cumvalues / cumsum : 0.; } } diff --git a/src/caffe/layers/recurrent_layer.cpp b/src/caffe/layers/recurrent_layer.cpp new file mode 100644 index 00000000000..e0c82773392 --- /dev/null +++ b/src/caffe/layers/recurrent_layer.cpp @@ -0,0 +1,295 @@ +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/recurrent_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void RecurrentLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)"; + T_ = bottom[0]->shape(0); + N_ = bottom[0]->shape(1); + LOG(INFO) << "Initializing recurrent layer: assuming input batch contains " + << T_ << " timesteps of " << N_ << " independent streams."; + + CHECK_EQ(bottom[1]->num_axes(), 2) + << "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)"; + CHECK_EQ(T_, bottom[1]->shape(0)); + CHECK_EQ(N_, bottom[1]->shape(1)); + + // If expose_hidden is set, we take as input and produce as output + // the hidden state blobs at the first and last timesteps. + expose_hidden_ = this->layer_param_.recurrent_param().expose_hidden(); + + // Get (recurrent) input/output names. + vector output_names; + OutputBlobNames(&output_names); + vector recur_input_names; + RecurrentInputBlobNames(&recur_input_names); + vector recur_output_names; + RecurrentOutputBlobNames(&recur_output_names); + const int num_recur_blobs = recur_input_names.size(); + CHECK_EQ(num_recur_blobs, recur_output_names.size()); + + // If provided, bottom[2] is a static input to the recurrent net. 
+ const int num_hidden_exposed = expose_hidden_ * num_recur_blobs; + static_input_ = (bottom.size() > 2 + num_hidden_exposed); + if (static_input_) { + CHECK_GE(bottom[2]->num_axes(), 1); + CHECK_EQ(N_, bottom[2]->shape(0)); + } + + // Create a NetParameter; setup the inputs that aren't unique to particular + // recurrent architectures. + NetParameter net_param; + + LayerParameter* input_layer_param = net_param.add_layer(); + input_layer_param->set_type("Input"); + InputParameter* input_param = input_layer_param->mutable_input_param(); + input_layer_param->add_top("x"); + BlobShape input_shape; + for (int i = 0; i < bottom[0]->num_axes(); ++i) { + input_shape.add_dim(bottom[0]->shape(i)); + } + input_param->add_shape()->CopyFrom(input_shape); + + input_shape.Clear(); + for (int i = 0; i < bottom[1]->num_axes(); ++i) { + input_shape.add_dim(bottom[1]->shape(i)); + } + input_layer_param->add_top("cont"); + input_param->add_shape()->CopyFrom(input_shape); + + if (static_input_) { + input_shape.Clear(); + for (int i = 0; i < bottom[2]->num_axes(); ++i) { + input_shape.add_dim(bottom[2]->shape(i)); + } + input_layer_param->add_top("x_static"); + input_param->add_shape()->CopyFrom(input_shape); + } + + // Call the child's FillUnrolledNet implementation to specify the unrolled + // recurrent architecture. + this->FillUnrolledNet(&net_param); + + // Prepend this layer's name to the names of each layer in the unrolled net. + const string& layer_name = this->layer_param_.name(); + if (layer_name.size()) { + for (int i = 0; i < net_param.layer_size(); ++i) { + LayerParameter* layer = net_param.mutable_layer(i); + layer->set_name(layer_name + "_" + layer->name()); + } + } + + // Add "pseudo-losses" to all outputs to force backpropagation. + // (Setting force_backward is too aggressive as we may not need to backprop to + // all inputs, e.g., the sequence continuation indicators.) 
+ vector pseudo_losses(output_names.size()); + for (int i = 0; i < output_names.size(); ++i) { + LayerParameter* layer = net_param.add_layer(); + pseudo_losses[i] = output_names[i] + "_pseudoloss"; + layer->set_name(pseudo_losses[i]); + layer->set_type("Reduction"); + layer->add_bottom(output_names[i]); + layer->add_top(pseudo_losses[i]); + layer->add_loss_weight(1); + } + + // Create the unrolled net. + unrolled_net_.reset(new Net(net_param)); + unrolled_net_->set_debug_info( + this->layer_param_.recurrent_param().debug_info()); + + // Setup pointers to the inputs. + x_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("x").get()); + cont_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("cont").get()); + if (static_input_) { + x_static_input_blob_ = + CHECK_NOTNULL(unrolled_net_->blob_by_name("x_static").get()); + } + + // Setup pointers to paired recurrent inputs/outputs. + recur_input_blobs_.resize(num_recur_blobs); + recur_output_blobs_.resize(num_recur_blobs); + for (int i = 0; i < recur_input_names.size(); ++i) { + recur_input_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_input_names[i]).get()); + recur_output_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_output_names[i]).get()); + } + + // Setup pointers to outputs. + CHECK_EQ(top.size() - num_hidden_exposed, output_names.size()) + << "OutputBlobNames must provide an output blob name for each top."; + output_blobs_.resize(output_names.size()); + for (int i = 0; i < output_names.size(); ++i) { + output_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(output_names[i]).get()); + } + + // We should have 2 inputs (x and cont), plus a number of recurrent inputs, + // plus maybe a static input. + CHECK_EQ(2 + num_recur_blobs + static_input_, + unrolled_net_->input_blobs().size()); + + // This layer's parameters are any parameters in the layers of the unrolled + // net. 
We only want one copy of each parameter, so check that the parameter + // is "owned" by the layer, rather than shared with another. + this->blobs_.clear(); + for (int i = 0; i < unrolled_net_->params().size(); ++i) { + if (unrolled_net_->param_owners()[i] == -1) { + LOG(INFO) << "Adding parameter " << i << ": " + << unrolled_net_->param_display_names()[i]; + this->blobs_.push_back(unrolled_net_->params()[i]); + } + } + // Check that param_propagate_down is set for all of the parameters in the + // unrolled net; set param_propagate_down to true in this layer. + for (int i = 0; i < unrolled_net_->layers().size(); ++i) { + for (int j = 0; j < unrolled_net_->layers()[i]->blobs().size(); ++j) { + CHECK(unrolled_net_->layers()[i]->param_propagate_down(j)) + << "param_propagate_down not set for layer " << i << ", param " << j; + } + } + this->param_propagate_down_.clear(); + this->param_propagate_down_.resize(this->blobs_.size(), true); + + // Set the diffs of recurrent outputs to 0 -- we can't backpropagate across + // batches. + for (int i = 0; i < recur_output_blobs_.size(); ++i) { + caffe_set(recur_output_blobs_[i]->count(), Dtype(0), + recur_output_blobs_[i]->mutable_cpu_diff()); + } + + // Check that the last output_names.size() layers are the pseudo-losses; + // set last_layer_index so that we don't actually run these layers. 
+ const vector& layer_names = unrolled_net_->layer_names(); + last_layer_index_ = layer_names.size() - 1 - pseudo_losses.size(); + for (int i = last_layer_index_ + 1, j = 0; i < layer_names.size(); ++i, ++j) { + CHECK_EQ(layer_names[i], pseudo_losses[j]); + } +} + +template +void RecurrentLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)"; + CHECK_EQ(T_, bottom[0]->shape(0)) << "input number of timesteps changed"; + N_ = bottom[0]->shape(1); + CHECK_EQ(bottom[1]->num_axes(), 2) + << "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)"; + CHECK_EQ(T_, bottom[1]->shape(0)); + CHECK_EQ(N_, bottom[1]->shape(1)); + x_input_blob_->ReshapeLike(*bottom[0]); + vector cont_shape = bottom[1]->shape(); + cont_input_blob_->Reshape(cont_shape); + if (static_input_) { + x_static_input_blob_->ReshapeLike(*bottom[2]); + } + vector recur_input_shapes; + RecurrentInputShapes(&recur_input_shapes); + CHECK_EQ(recur_input_shapes.size(), recur_input_blobs_.size()); + for (int i = 0; i < recur_input_shapes.size(); ++i) { + recur_input_blobs_[i]->Reshape(recur_input_shapes[i]); + } + unrolled_net_->Reshape(); + x_input_blob_->ShareData(*bottom[0]); + x_input_blob_->ShareDiff(*bottom[0]); + cont_input_blob_->ShareData(*bottom[1]); + if (static_input_) { + x_static_input_blob_->ShareData(*bottom[2]); + x_static_input_blob_->ShareDiff(*bottom[2]); + } + if (expose_hidden_) { + const int bottom_offset = 2 + static_input_; + for (int i = bottom_offset, j = 0; i < bottom.size(); ++i, ++j) { + CHECK(recur_input_blobs_[j]->shape() == bottom[i]->shape()) + << "bottom[" << i << "] shape must match hidden state input shape: " + << recur_input_blobs_[j]->shape_string(); + recur_input_blobs_[j]->ShareData(*bottom[i]); + } + } + for (int i = 0; i < output_blobs_.size(); ++i) { + top[i]->ReshapeLike(*output_blobs_[i]); + top[i]->ShareData(*output_blobs_[i]); + 
top[i]->ShareDiff(*output_blobs_[i]); + } + if (expose_hidden_) { + const int top_offset = output_blobs_.size(); + for (int i = top_offset, j = 0; i < top.size(); ++i, ++j) { + top[i]->ReshapeLike(*recur_output_blobs_[j]); + } + } +} + +template +void RecurrentLayer::Reset() { + // "Reset" the hidden state of the net by zeroing out all recurrent outputs. + for (int i = 0; i < recur_output_blobs_.size(); ++i) { + caffe_set(recur_output_blobs_[i]->count(), Dtype(0), + recur_output_blobs_[i]->mutable_cpu_data()); + } +} + +template +void RecurrentLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + // Hacky fix for test time: reshare all the internal shared blobs, which may + // currently point to a stale owner blob that was dropped when Solver::Test + // called test_net->ShareTrainedLayersWith(net_.get()). + // TODO: somehow make this work non-hackily. + if (this->phase_ == TEST) { + unrolled_net_->ShareWeights(); + } + + DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size()); + if (!expose_hidden_) { + for (int i = 0; i < recur_input_blobs_.size(); ++i) { + const int count = recur_input_blobs_[i]->count(); + DCHECK_EQ(count, recur_output_blobs_[i]->count()); + const Dtype* timestep_T_data = recur_output_blobs_[i]->cpu_data(); + Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_cpu_data(); + caffe_copy(count, timestep_T_data, timestep_0_data); + } + } + + unrolled_net_->ForwardTo(last_layer_index_); + + if (expose_hidden_) { + const int top_offset = output_blobs_.size(); + for (int i = top_offset, j = 0; i < top.size(); ++i, ++j) { + top[i]->ShareData(*recur_output_blobs_[j]); + } + } +} + +template +void RecurrentLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + CHECK(!propagate_down[1]) << "Cannot backpropagate to sequence indicators."; + + // TODO: skip backpropagation to inputs and parameters inside the unrolled + // net according to propagate_down[0] and propagate_down[2]. 
For now just + // backprop to inputs and parameters unconditionally, as either the inputs or + // the parameters do need backward (or Net would have set + // layer_needs_backward_[i] == false for this layer). + unrolled_net_->BackwardFrom(last_layer_index_); +} + +#ifdef CPU_ONLY +STUB_GPU_FORWARD(RecurrentLayer, Forward); +#endif + +INSTANTIATE_CLASS(RecurrentLayer); + +} // namespace caffe diff --git a/src/caffe/layers/recurrent_layer.cu b/src/caffe/layers/recurrent_layer.cu new file mode 100644 index 00000000000..4dd2b0e2165 --- /dev/null +++ b/src/caffe/layers/recurrent_layer.cu @@ -0,0 +1,44 @@ +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/recurrent_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void RecurrentLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + // Hacky fix for test time... reshare all the shared blobs. + // TODO: somehow make this work non-hackily. 
+ if (this->phase_ == TEST) { + unrolled_net_->ShareWeights(); + } + + DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size()); + if (!expose_hidden_) { + for (int i = 0; i < recur_input_blobs_.size(); ++i) { + const int count = recur_input_blobs_[i]->count(); + DCHECK_EQ(count, recur_output_blobs_[i]->count()); + const Dtype* timestep_T_data = recur_output_blobs_[i]->gpu_data(); + Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_gpu_data(); + caffe_copy(count, timestep_T_data, timestep_0_data); + } + } + + unrolled_net_->ForwardTo(last_layer_index_); + + if (expose_hidden_) { + const int top_offset = output_blobs_.size(); + for (int i = top_offset, j = 0; i < top.size(); ++i, ++j) { + top[i]->ShareData(*recur_output_blobs_[j]); + } + } +} + +INSTANTIATE_LAYER_GPU_FORWARD(RecurrentLayer); + +} // namespace caffe diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index 92a729c81bd..6858be0828d 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -35,9 +35,38 @@ void ReLULayer::Backward_cpu(const vector*>& top, } } +template +void ReLULayer::Deconv_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom, int deconv_type) { + + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + if (negative_slope != Dtype(0)) + LOG(WARNING) << "negative_slope parameter = " << negative_slope << " but nonzero negative_slope params are not supported for Deconv through RELU."; + + // Zeiler & Fergus deconv + if (deconv_type == 0) { + for (int i = 0; i < count; ++i) { + bottom_diff[i] = std::max(top_diff[i], Dtype(0)); + } + } + // "guided backprop" deconv + else if (deconv_type == 1) { + for (int i = 0; i < count; ++i) { + bottom_diff[i] = 
top_diff[i] * ((bottom_data[i] > 0) + negative_slope * (bottom_data[i] <= 0)) * (top_diff[i] > 0); + } + } + } +} + #ifdef CPU_ONLY -STUB_GPU(ReLULayer); +STUB_GPU_WITH_DECONV(ReLULayer); #endif INSTANTIATE_CLASS(ReLULayer); diff --git a/src/caffe/layers/relu_layer.cu b/src/caffe/layers/relu_layer.cu index 4bf15b3aad3..61d1251e665 100644 --- a/src/caffe/layers/relu_layer.cu +++ b/src/caffe/layers/relu_layer.cu @@ -57,8 +57,57 @@ void ReLULayer::Backward_gpu(const vector*>& top, } } +template +__global__ void ReLUDeconv_ZF(const int n, const Dtype* in_diff, + const Dtype* in_data, Dtype* out_diff, Dtype negative_slope) { + CUDA_KERNEL_LOOP(index, n) { + // Zeiler & Fergus deconv + out_diff[index] = in_diff[index] > 0 ? in_diff[index] : 0; + } +} + +template +__global__ void ReLUDeconv_GB(const int n, const Dtype* in_diff, + const Dtype* in_data, Dtype* out_diff, Dtype negative_slope) { + CUDA_KERNEL_LOOP(index, n) { + // "guided backprop" deconv + out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope) * (in_diff[index] > 0); + } +} + +template +void ReLULayer::Deconv_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom, int deconv_type) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + if (negative_slope != Dtype(0)) + LOG(WARNING) << "negative_slope parameter = " << negative_slope << " but nonzero negative_slope params are not supported for Deconv through RELU."; + + // Zeiler & Fergus deconv + if (deconv_type == 0) { + // NOLINT_NEXT_LINE(whitespace/operators) + ReLUDeconv_ZF<<>>( + count, top_diff, bottom_data, bottom_diff, negative_slope); + } + // "guided backprop" deconv + else if (deconv_type == 1) { + // NOLINT_NEXT_LINE(whitespace/operators) 
+ ReLUDeconv_GB<<>>( + count, top_diff, bottom_data, bottom_diff, negative_slope); + } + + CUDA_POST_KERNEL_CHECK; + } +} + -INSTANTIATE_LAYER_GPU_FUNCS(ReLULayer); +//INSTANTIATE_LAYER_GPU_FUNCS(ReLULayer); +INSTANTIATE_LAYER_GPU_FUNCS_WITH_DECONV(ReLULayer); } // namespace caffe diff --git a/src/caffe/layers/rnn_layer.cpp b/src/caffe/layers/rnn_layer.cpp new file mode 100644 index 00000000000..8c2fa22e598 --- /dev/null +++ b/src/caffe/layers/rnn_layer.cpp @@ -0,0 +1,236 @@ +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/rnn_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void RNNLayer::RecurrentInputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "h_0"; +} + +template +void RNNLayer::RecurrentOutputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "h_" + format_int(this->T_); +} + +template +void RNNLayer::RecurrentInputShapes(vector* shapes) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + shapes->resize(1); + (*shapes)[0].Clear(); + (*shapes)[0].add_dim(1); // a single timestep + (*shapes)[0].add_dim(this->N_); + (*shapes)[0].add_dim(num_output); +} + +template +void RNNLayer::OutputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "o"; +} + +template +void RNNLayer::FillUnrolledNet(NetParameter* net_param) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + CHECK_GT(num_output, 0) << "num_output must be positive"; + const FillerParameter& weight_filler = + this->layer_param_.recurrent_param().weight_filler(); + const FillerParameter& bias_filler = + this->layer_param_.recurrent_param().bias_filler(); + + // Add generic LayerParameter's (without bottoms/tops) of layer types we'll + // use to save redundant code. 
+ LayerParameter hidden_param; + hidden_param.set_type("InnerProduct"); + hidden_param.mutable_inner_product_param()->set_num_output(num_output); + hidden_param.mutable_inner_product_param()->set_bias_term(false); + hidden_param.mutable_inner_product_param()->set_axis(2); + hidden_param.mutable_inner_product_param()-> + mutable_weight_filler()->CopyFrom(weight_filler); + + LayerParameter biased_hidden_param(hidden_param); + biased_hidden_param.mutable_inner_product_param()->set_bias_term(true); + biased_hidden_param.mutable_inner_product_param()-> + mutable_bias_filler()->CopyFrom(bias_filler); + + LayerParameter sum_param; + sum_param.set_type("Eltwise"); + sum_param.mutable_eltwise_param()->set_operation( + EltwiseParameter_EltwiseOp_SUM); + + LayerParameter tanh_param; + tanh_param.set_type("TanH"); + + LayerParameter scale_param; + scale_param.set_type("Scale"); + scale_param.mutable_scale_param()->set_axis(0); + + LayerParameter slice_param; + slice_param.set_type("Slice"); + slice_param.mutable_slice_param()->set_axis(0); + + vector input_shapes; + RecurrentInputShapes(&input_shapes); + CHECK_EQ(1, input_shapes.size()); + + LayerParameter* input_layer_param = net_param->add_layer(); + input_layer_param->set_type("Input"); + InputParameter* input_param = input_layer_param->mutable_input_param(); + input_layer_param->add_top("h_0"); + input_param->add_shape()->CopyFrom(input_shapes[0]); + + LayerParameter* cont_slice_param = net_param->add_layer(); + cont_slice_param->CopyFrom(slice_param); + cont_slice_param->set_name("cont_slice"); + cont_slice_param->add_bottom("cont"); + cont_slice_param->mutable_slice_param()->set_axis(0); + + // Add layer to transform all timesteps of x to the hidden state dimension. 
+ // W_xh_x = W_xh * x + b_h + { + LayerParameter* x_transform_param = net_param->add_layer(); + x_transform_param->CopyFrom(biased_hidden_param); + x_transform_param->set_name("x_transform"); + x_transform_param->add_param()->set_name("W_xh"); + x_transform_param->add_param()->set_name("b_h"); + x_transform_param->add_bottom("x"); + x_transform_param->add_top("W_xh_x"); + x_transform_param->add_propagate_down(true); + } + + if (this->static_input_) { + // Add layer to transform x_static to the hidden state dimension. + // W_xh_x_static = W_xh_static * x_static + LayerParameter* x_static_transform_param = net_param->add_layer(); + x_static_transform_param->CopyFrom(hidden_param); + x_static_transform_param->mutable_inner_product_param()->set_axis(1); + x_static_transform_param->set_name("W_xh_x_static"); + x_static_transform_param->add_param()->set_name("W_xh_static"); + x_static_transform_param->add_bottom("x_static"); + x_static_transform_param->add_top("W_xh_x_static_preshape"); + x_static_transform_param->add_propagate_down(true); + + LayerParameter* reshape_param = net_param->add_layer(); + reshape_param->set_type("Reshape"); + BlobShape* new_shape = + reshape_param->mutable_reshape_param()->mutable_shape(); + new_shape->add_dim(1); // One timestep. + // Should infer this->N as the dimension so we can reshape on batch size. 
+ new_shape->add_dim(-1); + new_shape->add_dim( + x_static_transform_param->inner_product_param().num_output()); + reshape_param->set_name("W_xh_x_static_reshape"); + reshape_param->add_bottom("W_xh_x_static_preshape"); + reshape_param->add_top("W_xh_x_static"); + } + + LayerParameter* x_slice_param = net_param->add_layer(); + x_slice_param->CopyFrom(slice_param); + x_slice_param->set_name("W_xh_x_slice"); + x_slice_param->add_bottom("W_xh_x"); + + LayerParameter output_concat_layer; + output_concat_layer.set_name("o_concat"); + output_concat_layer.set_type("Concat"); + output_concat_layer.add_top("o"); + output_concat_layer.mutable_concat_param()->set_axis(0); + + for (int t = 1; t <= this->T_; ++t) { + string tm1s = format_int(t - 1); + string ts = format_int(t); + + cont_slice_param->add_top("cont_" + ts); + x_slice_param->add_top("W_xh_x_" + ts); + + // Add layer to flush the hidden state when beginning a new sequence, + // as indicated by cont_t. + // h_conted_{t-1} := cont_t * h_{t-1} + // + // Normally, cont_t is binary (i.e., 0 or 1), so: + // h_conted_{t-1} := h_{t-1} if cont_t == 1 + // 0 otherwise + { + LayerParameter* cont_h_param = net_param->add_layer(); + cont_h_param->CopyFrom(scale_param); + cont_h_param->set_name("h_conted_" + tm1s); + cont_h_param->add_bottom("h_" + tm1s); + cont_h_param->add_bottom("cont_" + ts); + cont_h_param->add_top("h_conted_" + tm1s); + } + + // Add layer to compute + // W_hh_h_{t-1} := W_hh * h_conted_{t-1} + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(hidden_param); + w_param->set_name("W_hh_h_" + tm1s); + w_param->add_param()->set_name("W_hh"); + w_param->add_bottom("h_conted_" + tm1s); + w_param->add_top("W_hh_h_" + tm1s); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add layers to compute + // h_t := \tanh( W_hh * h_conted_{t-1} + W_xh * x_t + b_h ) + // = \tanh( W_hh_h_{t-1} + W_xh_t ) + { + LayerParameter* h_input_sum_param = net_param->add_layer(); + 
h_input_sum_param->CopyFrom(sum_param); + h_input_sum_param->set_name("h_input_sum_" + ts); + h_input_sum_param->add_bottom("W_hh_h_" + tm1s); + h_input_sum_param->add_bottom("W_xh_x_" + ts); + if (this->static_input_) { + h_input_sum_param->add_bottom("W_xh_x_static"); + } + h_input_sum_param->add_top("h_neuron_input_" + ts); + } + { + LayerParameter* h_neuron_param = net_param->add_layer(); + h_neuron_param->CopyFrom(tanh_param); + h_neuron_param->set_name("h_neuron_" + ts); + h_neuron_param->add_bottom("h_neuron_input_" + ts); + h_neuron_param->add_top("h_" + ts); + } + + // Add layer to compute + // W_ho_h_t := W_ho * h_t + b_o + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(biased_hidden_param); + w_param->set_name("W_ho_h_" + ts); + w_param->add_param()->set_name("W_ho"); + w_param->add_param()->set_name("b_o"); + w_param->add_bottom("h_" + ts); + w_param->add_top("W_ho_h_" + ts); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add layers to compute + // o_t := \tanh( W_ho * h_t + b_o) + // = \tanh( W_ho_h_t ) + { + LayerParameter* o_neuron_param = net_param->add_layer(); + o_neuron_param->CopyFrom(tanh_param); + o_neuron_param->set_name("o_neuron_" + ts); + o_neuron_param->add_bottom("W_ho_h_" + ts); + o_neuron_param->add_top("o_" + ts); + } + output_concat_layer.add_bottom("o_" + ts); + } // for (int t = 1; t <= this->T_; ++t) + + net_param->add_layer()->CopyFrom(output_concat_layer); +} + +INSTANTIATE_CLASS(RNNLayer); +REGISTER_LAYER_CLASS(RNN); + +} // namespace caffe diff --git a/src/caffe/layers/scale_layer.cpp b/src/caffe/layers/scale_layer.cpp index ecdbb123e31..e652dad6e10 100644 --- a/src/caffe/layers/scale_layer.cpp +++ b/src/caffe/layers/scale_layer.cpp @@ -56,9 +56,17 @@ void ScaleLayer::LayerSetUp(const vector*>& bottom, bias_bottom_vec_.resize(1); bias_bottom_vec_[0] = bottom[0]; bias_layer_->SetUp(bias_bottom_vec_, top); - bias_param_id_ = this->blobs_.size(); - this->blobs_.resize(bias_param_id_ + 
1); - this->blobs_[bias_param_id_] = bias_layer_->blobs()[0]; + if (this->blobs_.size() + bottom.size() < 3) { + // case: blobs.size == 1 && bottom.size == 1 + // or blobs.size == 0 && bottom.size == 2 + bias_param_id_ = this->blobs_.size(); + this->blobs_.resize(bias_param_id_ + 1); + this->blobs_[bias_param_id_] = bias_layer_->blobs()[0]; + } else { + // bias param already initialized + bias_param_id_ = this->blobs_.size() - 1; + bias_layer_->blobs()[0] = this->blobs_[bias_param_id_]; + } bias_propagate_down_.resize(1, false); } this->param_propagate_down_.resize(this->blobs_.size(), true); diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 10ac9470832..99fa3eb645a 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -1,3 +1,4 @@ +#include #include #include "caffe/layers/sigmoid_cross_entropy_loss_layer.hpp" @@ -14,17 +15,66 @@ void SigmoidCrossEntropyLossLayer::LayerSetUp( sigmoid_top_vec_.clear(); sigmoid_top_vec_.push_back(sigmoid_output_.get()); sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_); + + has_ignore_label_ = + this->layer_param_.loss_param().has_ignore_label(); + if (has_ignore_label_) { + ignore_label_ = this->layer_param_.loss_param().ignore_label(); + } + if (this->layer_param_.loss_param().has_normalization()) { + normalization_ = this->layer_param_.loss_param().normalization(); + } else if (this->layer_param_.loss_param().has_normalize()) { + normalization_ = this->layer_param_.loss_param().normalize() ? 
+ LossParameter_NormalizationMode_VALID : + LossParameter_NormalizationMode_BATCH_SIZE; + } else { + normalization_ = LossParameter_NormalizationMode_BATCH_SIZE; + } } template void SigmoidCrossEntropyLossLayer::Reshape( const vector*>& bottom, const vector*>& top) { LossLayer::Reshape(bottom, top); + outer_num_ = bottom[0]->shape(0); // batch size + inner_num_ = bottom[0]->count(1); // instance size: |output| == |target| CHECK_EQ(bottom[0]->count(), bottom[1]->count()) << "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count."; sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_); } +// TODO(shelhamer) loss normalization should be pulled up into LossLayer, +// instead of duplicated here and in SoftMaxWithLossLayer +template +Dtype SigmoidCrossEntropyLossLayer::get_normalizer( + LossParameter_NormalizationMode normalization_mode, int valid_count) { + Dtype normalizer; + switch (normalization_mode) { + case LossParameter_NormalizationMode_FULL: + normalizer = Dtype(outer_num_ * inner_num_); + break; + case LossParameter_NormalizationMode_VALID: + if (valid_count == -1) { + normalizer = Dtype(outer_num_ * inner_num_); + } else { + normalizer = Dtype(valid_count); + } + break; + case LossParameter_NormalizationMode_BATCH_SIZE: + normalizer = Dtype(outer_num_); + break; + case LossParameter_NormalizationMode_NONE: + normalizer = Dtype(1); + break; + default: + LOG(FATAL) << "Unknown normalization mode: " + << LossParameter_NormalizationMode_Name(normalization_mode); + } + // Some users will have no labels for some examples in order to 'turn off' a + // particular loss in a multi-task setup. The max prevents NaNs in that case. 
+ return std::max(Dtype(1.0), normalizer); +} + template void SigmoidCrossEntropyLossLayer::Forward_cpu( const vector*>& bottom, const vector*>& top) { @@ -32,17 +82,22 @@ void SigmoidCrossEntropyLossLayer::Forward_cpu( sigmoid_bottom_vec_[0] = bottom[0]; sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_); // Compute the loss (negative log likelihood) - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); // Stable version of loss computation from input data const Dtype* input_data = bottom[0]->cpu_data(); const Dtype* target = bottom[1]->cpu_data(); + int valid_count = 0; Dtype loss = 0; - for (int i = 0; i < count; ++i) { + for (int i = 0; i < bottom[0]->count(); ++i) { + const int target_value = static_cast(target[i]); + if (has_ignore_label_ && target_value == ignore_label_) { + continue; + } loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); + ++valid_count; } - top[0]->mutable_cpu_data()[0] = loss / num; + normalizer_ = get_normalizer(normalization_, valid_count); + top[0]->mutable_cpu_data()[0] = loss / normalizer_; } template @@ -56,19 +111,27 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( if (propagate_down[0]) { // First, compute the diff const int count = bottom[0]->count(); - const int num = bottom[0]->num(); const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data(); const Dtype* target = bottom[1]->cpu_data(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); caffe_sub(count, sigmoid_output_data, target, bottom_diff); + // Zero out gradient of ignored targets. 
+ if (has_ignore_label_) { + for (int i = 0; i < count; ++i) { + const int target_value = static_cast(target[i]); + if (target_value == ignore_label_) { + bottom_diff[i] = 0; + } + } + } // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_scal(count, loss_weight / num, bottom_diff); + Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer_; + caffe_scal(count, loss_weight, bottom_diff); } } #ifdef CPU_ONLY -STUB_GPU_BACKWARD(SigmoidCrossEntropyLossLayer, Backward); +STUB_GPU(SigmoidCrossEntropyLossLayer); #endif INSTANTIATE_CLASS(SigmoidCrossEntropyLossLayer); diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu index 046cb9d3a31..b9877e6a3f6 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu @@ -5,6 +5,72 @@ namespace caffe { + +template +__global__ void SigmoidCrossEntropyLossForwardGPU(const int nthreads, + const Dtype* input_data, const Dtype* target, Dtype* loss, + const bool has_ignore_label_, const int ignore_label_, + Dtype* counts) { + CUDA_KERNEL_LOOP(i, nthreads) { + const int target_value = static_cast(target[i]); + if (has_ignore_label_ && target_value == ignore_label_) { + loss[i] = 0; + counts[i] = 0; + } else { + loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0)) - + log(1 + exp(input_data[i] - 2 * input_data[i] * + (input_data[i] >= 0))); + counts[i] = 1; + } + } +} + +template +__global__ void SigmoidCrossEntropyLossIgnoreDiffGPU(const int count, + const int ignore_label, const Dtype* target, Dtype* diff) { + CUDA_KERNEL_LOOP(i, count) { + const int target_value = static_cast(target[i]); + if (target_value == ignore_label) { + diff[i] = 0; + } + } +} + + +template +void SigmoidCrossEntropyLossLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + // The forward pass computes the sigmoid outputs. 
+ sigmoid_bottom_vec_[0] = bottom[0]; + sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_); + // Compute the loss (negative log likelihood) + const int count = bottom[0]->count(); + // Stable version of loss computation from input data + const Dtype* input_data = bottom[0]->gpu_data(); + const Dtype* target = bottom[1]->gpu_data(); + // Since this memory is not used for anything until it is overwritten + // on the backward pass, we use it here to avoid having to allocate new GPU + // memory to accumulate intermediate results in the kernel. + Dtype* loss_data = bottom[0]->mutable_gpu_diff(); + Dtype* count_data = bottom[1]->mutable_gpu_diff(); + Dtype valid_count; + // NOLINT_NEXT_LINE(whitespace/operators) + SigmoidCrossEntropyLossForwardGPU<<>>(count, input_data, target, loss_data, + has_ignore_label_, ignore_label_, count_data); + // Only launch another CUDA kernel if we actually need the valid count. + if (normalization_ == LossParameter_NormalizationMode_VALID && + has_ignore_label_) { + caffe_gpu_asum(count, count_data, &valid_count); + } else { + valid_count = count; + } + Dtype loss; + caffe_gpu_asum(count, loss_data, &loss); + normalizer_ = get_normalizer(normalization_, valid_count); + top[0]->mutable_cpu_data()[0] = loss / normalizer_; +} + template void SigmoidCrossEntropyLossLayer::Backward_gpu( const vector*>& top, const vector& propagate_down, @@ -16,19 +82,23 @@ void SigmoidCrossEntropyLossLayer::Backward_gpu( if (propagate_down[0]) { // First, compute the diff const int count = bottom[0]->count(); - const int num = bottom[0]->num(); const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); const Dtype* target = bottom[1]->gpu_data(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); caffe_copy(count, sigmoid_output_data, bottom_diff); caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); + // Zero out gradient of ignored targets. 
+ if (has_ignore_label_) { + // NOLINT_NEXT_LINE(whitespace/operators) + SigmoidCrossEntropyLossIgnoreDiffGPU<<>>(count, ignore_label_, target, bottom_diff); + } // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_gpu_scal(count, loss_weight / num, bottom_diff); + Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer_; + caffe_gpu_scal(count, loss_weight, bottom_diff); } } -INSTANTIATE_LAYER_GPU_BACKWARD(SigmoidCrossEntropyLossLayer); - +INSTANTIATE_LAYER_GPU_FUNCS(SigmoidCrossEntropyLossLayer); } // namespace caffe diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 85fd9676812..f8aa769a174 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -7,7 +7,7 @@ namespace caffe { template inline Dtype sigmoid(Dtype x) { - return 1. / (1. + exp(-x)); + return 0.5 * tanh(0.5 * x) + 0.5; } template diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/sigmoid_layer.cu index 184c61ede83..8a4ea6616e0 100644 --- a/src/caffe/layers/sigmoid_layer.cu +++ b/src/caffe/layers/sigmoid_layer.cu @@ -8,7 +8,7 @@ namespace caffe { template __global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) { CUDA_KERNEL_LOOP(index, n) { - out[index] = 1. / (1. 
+ exp(-in[index])); + out[index] = 0.5 * tanh(0.5 * in[index]) + 0.5; } } diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index 4ca8315d791..1bf3760e9fd 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -173,8 +173,8 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, CHECK_GT(crop_size, 0); const int batch_size = this->layer_param_.window_data_param().batch_size(); top[0]->Reshape(batch_size, channels, crop_size, crop_size); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) - this->prefetch_[i].data_.Reshape( + for (int i = 0; i < this->prefetch_.size(); ++i) + this->prefetch_[i]->data_.Reshape( batch_size, channels, crop_size, crop_size); LOG(INFO) << "output data size: " << top[0]->num() << "," @@ -183,8 +183,8 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, // label vector label_shape(1, batch_size); top[1]->Reshape(label_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].label_.Reshape(label_shape); + for (int i = 0; i < this->prefetch_.size(); ++i) { + this->prefetch_[i]->label_.Reshape(label_shape); } // data mean @@ -265,6 +265,9 @@ void WindowDataLayer::load_batch(Batch* batch) { const int num_samples[2] = { batch_size - num_fg, num_fg }; int item_id = 0; + CHECK_GT(fg_windows_.size(), 0); + CHECK_GT(bg_windows_.size(), 0); + // sample from bg set then fg set for (int is_fg = 0; is_fg < 2; ++is_fg) { for (int dummy = 0; dummy < num_samples[is_fg]; ++dummy) { diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 23d94c97c07..d8ccbe44139 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -17,29 +17,31 @@ #include "caffe/util/math_functions.hpp" #include "caffe/util/upgrade_proto.hpp" -#include "caffe/test/test_caffe_main.hpp" - namespace caffe { template -Net::Net(const NetParameter& param, const Net* root_net) - : root_net_(root_net) { +Net::Net(const NetParameter& param) { Init(param); } template 
-Net::Net(const string& param_file, Phase phase, const Net* root_net) - : root_net_(root_net) { +Net::Net(const string& param_file, Phase phase, + const int level, const vector* stages) { NetParameter param; ReadNetParamsFromTextFileOrDie(param_file, ¶m); + // Set phase, stages and level param.mutable_state()->set_phase(phase); + if (stages != NULL) { + for (int i = 0; i < stages->size(); i++) { + param.mutable_state()->add_stage((*stages)[i]); + } + } + param.mutable_state()->set_level(level); Init(param); } template void Net::Init(const NetParameter& in_param) { - CHECK(Caffe::root_solver() || root_net_) - << "root_net_ needs to be set for all non-root solvers"; // Set phase from the state. phase_ = in_param.state().phase(); // Filter layers based on their include/exclude rules and @@ -65,9 +67,6 @@ void Net::Init(const NetParameter& in_param) { top_id_vecs_.resize(param.layer_size()); bottom_need_backward_.resize(param.layer_size()); for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) { - // For non-root solvers, whether this layer is shared from root_net_. - bool share_from_root = !Caffe::root_solver() - && root_net_->layers_[layer_id]->ShareInParallel(); // Inherit phase from net if unset. 
if (!param.layer(layer_id).has_phase()) { param.mutable_layer(layer_id)->set_phase(phase_); @@ -80,13 +79,7 @@ void Net::Init(const NetParameter& in_param) { << "propagate_down param must be specified " << "either 0 or bottom_size times "; } - if (share_from_root) { - LOG(INFO) << "Sharing layer " << layer_param.name() << " from root net"; - layers_.push_back(root_net_->layers_[layer_id]); - layers_[layer_id]->SetShared(true); - } else { - layers_.push_back(LayerRegistry::CreateLayer(layer_param)); - } + layers_.push_back(LayerRegistry::CreateLayer(layer_param)); layer_names_.push_back(layer_param.name()); LOG_IF(INFO, Caffe::root_solver()) << "Creating Layer " << layer_param.name(); @@ -125,19 +118,7 @@ void Net::Init(const NetParameter& in_param) { } } // After this layer is connected, set it up. - if (share_from_root) { - // Set up size of top blobs using root_net_ - const vector*>& base_top = root_net_->top_vecs_[layer_id]; - const vector*>& this_top = this->top_vecs_[layer_id]; - for (int top_id = 0; top_id < base_top.size(); ++top_id) { - this_top[top_id]->ReshapeLike(*base_top[top_id]); - LOG(INFO) << "Created top blob " << top_id << " (shape: " - << this_top[top_id]->shape_string() << ") for shared layer " - << layer_param.name(); - } - } else { - layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); - } + layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); LOG_IF(INFO, Caffe::root_solver()) << "Setting up " << layer_names_[layer_id]; for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { @@ -427,12 +408,11 @@ int Net::AppendBottom(const NetParameter& param, const int layer_id, bottom_vecs_[layer_id].push_back(blobs_[blob_id].get()); bottom_id_vecs_[layer_id].push_back(blob_id); available_blobs->erase(blob_name); - bool propagate_down = true; + bool need_backward = blob_need_backward_[blob_id]; // Check if the backpropagation on bottom_id should be skipped - if (layer_param.propagate_down_size() > 0) - 
propagate_down = layer_param.propagate_down(bottom_id); - const bool need_backward = blob_need_backward_[blob_id] && - propagate_down; + if (layer_param.propagate_down_size() > 0) { + need_backward = layer_param.propagate_down(bottom_id); + } bottom_need_backward_[layer_id].push_back(need_backward); return blob_id; } @@ -538,10 +518,15 @@ Dtype Net::ForwardFromTo(int start, int end) { CHECK_LT(end, layers_.size()); Dtype loss = 0; for (int i = start; i <= end; ++i) { - // LOG(ERROR) << "Forwarding " << layer_names_[i]; + for (int c = 0; c < before_forward_.size(); ++c) { + before_forward_[c]->run(i); + } Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); loss += layer_loss; if (debug_info_) { ForwardDebugInfo(i); } + for (int c = 0; c < after_forward_.size(); ++c) { + after_forward_[c]->run(i); + } } return loss; } @@ -583,11 +568,30 @@ void Net::BackwardFromTo(int start, int end) { CHECK_GE(end, 0); CHECK_LT(start, layers_.size()); for (int i = start; i >= end; --i) { + for (int c = 0; c < before_backward_.size(); ++c) { + before_backward_[c]->run(i); + } if (layer_need_backward_[i]) { layers_[i]->Backward( top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); if (debug_info_) { BackwardDebugInfo(i); } } + for (int c = 0; c < after_backward_.size(); ++c) { + after_backward_[c]->run(i); + } + } +} + +template +void Net::DeconvFromTo(int start, int end, int deconv_type) { + CHECK_GE(end, 0); + CHECK_LT(start, layers_.size()); + for (int i = start; i >= end; --i) { + if (layer_need_backward_[i]) { + layers_[i]->Deconv( + top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i], deconv_type); + if (debug_info_) { DeconvDebugInfo(i); } + } } } @@ -644,6 +648,29 @@ void Net::BackwardDebugInfo(const int layer_id) { } } +template +void Net::DeconvDebugInfo(const int layer_id) { + const vector*>& bottom_vec = bottom_vecs_[layer_id]; + for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) { + if 
(!bottom_need_backward_[layer_id][bottom_id]) { continue; } + const Blob& blob = *bottom_vec[bottom_id]; + const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); + LOG(INFO) << " [Deconv] " + << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name + << " diff: " << diff_abs_val_mean; + } + for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); + ++param_id) { + if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; } + const Blob& blob = *layers_[layer_id]->blobs()[param_id]; + const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); + LOG(INFO) << " [Deconv] " + << "Layer " << layer_names_[layer_id] << ", param blob " << param_id + << " diff: " << diff_abs_val_mean; + } +} + template void Net::UpdateDebugInfo(const int param_id) { const Blob& blob = *params_[param_id]; @@ -731,6 +758,21 @@ void Net::Backward() { } } +template +void Net::DeconvFrom(int start, int deconv_type) { + DeconvFromTo(start, 0, deconv_type); +} + +template +void Net::DeconvTo(int end, int deconv_type) { + DeconvFromTo(layers_.size() - 1, end, deconv_type); +} + +template +void Net::Deconv(int deconv_type) { + DeconvFromTo(layers_.size() - 1, 0, deconv_type); +} + template void Net::Reshape() { for (int i = 0; i < layers_.size(); ++i) { @@ -778,8 +820,7 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { template void Net::CopyTrainedLayersFrom(const string trained_filename) { - if (trained_filename.size() >= 3 && - trained_filename.compare(trained_filename.size() - 3, 3, ".h5") == 0) { + if (H5Fis_hdf5(trained_filename.c_str())) { CopyTrainedLayersFromHDF5(trained_filename); } else { CopyTrainedLayersFromBinaryProto(trained_filename); diff --git a/src/caffe/parallel.cpp b/src/caffe/parallel.cpp index 5bc41c6a6e5..d9433917d25 100644 --- a/src/caffe/parallel.cpp +++ b/src/caffe/parallel.cpp @@ -1,16 +1,15 @@ -#ifndef CPU_ONLY +#ifdef USE_NCCL + 
#include -#endif #include #include - #include #include #include -#include "boost/thread.hpp" #include "caffe/caffe.hpp" #include "caffe/parallel.hpp" +#include "caffe/sgd_solvers.hpp" namespace caffe { @@ -68,15 +67,14 @@ static size_t total_size(const vector*>& params) { template Params::Params(shared_ptr > root_solver) - : size_(total_size(root_solver->net()->learnable_params())), - data_(), - diff_() { + : size_(total_size(root_solver->net()->learnable_params())), + data_(), + diff_() { } template GPUParams::GPUParams(shared_ptr > root_solver, int device) - : Params(root_solver) { -#ifndef CPU_ONLY + : Params(root_solver) { int initial_device; CUDA_CHECK(cudaGetDevice(&initial_device)); @@ -86,358 +84,288 @@ GPUParams::GPUParams(shared_ptr > root_solver, int device) // Copy blob values const vector*>& net = - root_solver->net()->learnable_params(); + root_solver->net()->learnable_params(); apply_buffers(net, data_, size_, copy); CUDA_CHECK(cudaMalloc(&diff_, size_ * sizeof(Dtype))); caffe_gpu_set(size_, Dtype(0), diff_); CUDA_CHECK(cudaSetDevice(initial_device)); -#else - NO_GPU; -#endif } template GPUParams::~GPUParams() { -#ifndef CPU_ONLY CUDA_CHECK(cudaFree(data_)); CUDA_CHECK(cudaFree(diff_)); -#endif } template -void GPUParams::configure(Solver* solver) const { +void GPUParams::Configure(Solver* solver) const { const vector*>& net = - solver->net()->learnable_params(); + solver->net()->learnable_params(); apply_buffers(net, data_, size_, replace_gpu); apply_buffers(net, diff_, size_, replace_gpu_diff); } -void DevicePair::compute(const vector devices, vector* pairs) { -#ifndef CPU_ONLY - vector remaining(devices); - - // Depth for reduction tree - int remaining_depth = static_cast(ceil(log2(remaining.size()))); - - // Group GPUs by board - for (int d = 0; d < remaining_depth; ++d) { - for (int i = 0; i < remaining.size(); ++i) { - for (int j = i + 1; j < remaining.size(); ++j) { - cudaDeviceProp a, b; - CUDA_CHECK(cudaGetDeviceProperties(&a, 
remaining[i])); - CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j])); - if (a.isMultiGpuBoard && b.isMultiGpuBoard) { - if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) { - pairs->push_back(DevicePair(remaining[i], remaining[j])); - DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j]; - remaining.erase(remaining.begin() + j); - break; - } - } - } - } - } - ostringstream s; - for (int i = 0; i < remaining.size(); ++i) { - s << (i ? ", " : "") << remaining[i]; - } - DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str(); - - // Group by P2P accessibility - remaining_depth = ceil(log2(remaining.size())); - for (int d = 0; d < remaining_depth; ++d) { - for (int i = 0; i < remaining.size(); ++i) { - for (int j = i + 1; j < remaining.size(); ++j) { - int access; - CUDA_CHECK( - cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j])); - if (access) { - pairs->push_back(DevicePair(remaining[i], remaining[j])); - DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j]; - remaining.erase(remaining.begin() + j); - break; - } - } - } - } - s.str(""); - for (int i = 0; i < remaining.size(); ++i) { - s << (i ? 
", " : "") << remaining[i]; - } - DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str(); - - // Group remaining - remaining_depth = ceil(log2(remaining.size())); - for (int d = 0; d < remaining_depth; ++d) { - for (int i = 0; i < remaining.size(); ++i) { - pairs->push_back(DevicePair(remaining[i], remaining[i + 1])); - DLOG(INFO) << "Remaining pair: " << remaining[i] << ":" - << remaining[i + 1]; - remaining.erase(remaining.begin() + i + 1); - } - } +static int getDevice() { + int device = 0; + CUDA_CHECK(cudaGetDevice(&device)); + return device; +} - // Should only be the parent node remaining - CHECK_EQ(remaining.size(), 1); +template +NCCL::NCCL(shared_ptr > solver) + : GPUParams(solver, getDevice()), + comm_(), solver_(solver), barrier_() { + this->Configure(solver.get()); + Init(); +} - pairs->insert(pairs->begin(), DevicePair(-1, remaining[0])); +template +NCCL::NCCL(shared_ptr > solver, const string& uid) + : GPUParams(solver, getDevice()), + solver_(solver), barrier_() { + this->Configure(solver.get()); + Caffe::set_multiprocess(true); + ncclUniqueId nccl_uid; + memcpy(&nccl_uid, &uid[0], NCCL_UNIQUE_ID_BYTES); // NOLINT(caffe/alt_fn) + NCCL_CHECK(ncclCommInitRank(&comm_, + Caffe::solver_count(), + nccl_uid, + Caffe::solver_rank())); + Init(); +} - CHECK(pairs->size() == devices.size()); - for (int i = 0; i < pairs->size(); ++i) { - CHECK((*pairs)[i].parent() != (*pairs)[i].device()); - for (int j = i + 1; j < pairs->size(); ++j) { - CHECK((*pairs)[i].device() != (*pairs)[j].device()); - } +template +void NCCL::Init() { + if (solver_->param().layer_wise_reduce()) { + CUDA_CHECK(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); } -#else - NO_GPU; -#endif } -// - template -P2PSync::P2PSync(shared_ptr > root_solver, - P2PSync* parent, const SolverParameter& param) - : GPUParams(root_solver, param.device_id()), - parent_(parent), - children_(), - queue_(), - initial_iter_(root_solver->iter()), - solver_() { -#ifndef CPU_ONLY - int 
initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - const int self = param.device_id(); - CUDA_CHECK(cudaSetDevice(self)); - - if (parent == NULL) { - solver_ = root_solver; - } else { - Caffe::set_root_solver(false); - solver_.reset(new WorkerSolver(param, root_solver.get())); - Caffe::set_root_solver(true); +NCCL::~NCCL() { + if (solver_->param().layer_wise_reduce()) { + CUDA_CHECK(cudaStreamDestroy(stream_)); } - this->configure(solver_.get()); - solver_->add_callback(this); - - if (parent) { - // Enable p2p access between devices - const int peer = parent->solver_->param().device_id(); - int access; - CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer)); - if (access) { - CUDA_CHECK(cudaDeviceEnablePeerAccess(peer, 0)); - } else { - LOG(INFO)<< "GPU " << self << " does not have p2p access to GPU " << peer; - } - // Allocate receiving buffer on parent - CUDA_CHECK(cudaSetDevice(peer)); - CUDA_CHECK(cudaMalloc(&parent_grads_, size_ * sizeof(Dtype))); - CUDA_CHECK(cudaSetDevice(self)); + if (comm_) { + ncclCommDestroy(comm_); } - - CUDA_CHECK(cudaSetDevice(initial_device)); -#else - NO_GPU; -#endif } template -P2PSync::~P2PSync() { -#ifndef CPU_ONLY - int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - const int self = solver_->param().device_id(); - CUDA_CHECK(cudaSetDevice(self)); - - if (parent_) { - CUDA_CHECK(cudaFree(parent_grads_)); - const int peer = parent_->solver_->param().device_id(); - int access; - CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer)); - if (access) { - CUDA_CHECK(cudaDeviceDisablePeerAccess(peer)); - } - } - - CUDA_CHECK(cudaSetDevice(initial_device)); -#endif +boost::barrier* NCCL::barrier() { + return barrier_; +} +template +void NCCL::set_barrier(boost::barrier* value) { + barrier_ = value; } template -void P2PSync::InternalThreadEntry() { - Caffe::SetDevice(solver_->param().device_id()); - CHECK(Caffe::root_solver()); - Caffe::set_root_solver(false); - // See if there is a defined seed and 
reset random state if so - if (solver_->param().random_seed() >= 0) { - // Fetch random seed and modulate by device ID to make sure - // everyone doesn't have the same seed. We seem to have some - // solver instability if we have everyone with the same seed - Caffe::set_random_seed( - solver_->param().random_seed() + solver_->param().device_id()); +void NCCL::InitSingleProcess(vector*>* nccls) { + ncclComm_t* comms = new ncclComm_t[nccls->size()]; + int* gpu_list = new int[nccls->size()]; + for (int i = 0; i < nccls->size(); ++i) { + gpu_list[i] = (*nccls)[i]->solver_->param().device_id(); + } + NCCL_CHECK(ncclCommInitAll(comms, static_cast(nccls->size()), gpu_list)); + for (int i = 0; i < nccls->size(); ++i) { + (*nccls)[i]->comm_ = comms[i]; } - solver_->Step(solver_->param().max_iter() - initial_iter_); } template -void P2PSync::on_start() { -#ifndef CPU_ONLY -#ifdef DEBUG - int device; - CUDA_CHECK(cudaGetDevice(&device)); - CHECK(device == solver_->param().device_id()); -#else -// CHECK(false); -#endif +string NCCL::new_uid() { + string uid; + uid.resize(NCCL_UNIQUE_ID_BYTES); + ncclUniqueId nccl_uid; + NCCL_CHECK(ncclGetUniqueId(&nccl_uid)); + memcpy(&uid[0], &nccl_uid, NCCL_UNIQUE_ID_BYTES); // NOLINT(caffe/alt_fn) + return uid; +} - // Wait for update from parent - if (parent_) { - P2PSync *parent = queue_.pop(); - CHECK(parent == parent_); +template +void NCCL::Broadcast() { + if (barrier_) { // NULL in multi process case + barrier_->wait(); } - - // Update children - for (int i = children_.size() - 1; i >= 0; i--) { - Dtype* src = data_; - Dtype* dst = children_[i]->data_; - -#ifdef DEBUG - cudaPointerAttributes attributes; - CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); - CHECK(attributes.device == device); - CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); - CHECK(attributes.device == children_[i]->solver_->param().device_id()); -#endif - - CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype), - cudaMemcpyDeviceToDevice, 
cudaStreamDefault)); - CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - children_[i]->queue_.push(this); + NCCL_CHECK(ncclBcast(data_, static_cast(size_), + nccl::dataType::type, 0, + comm_, cudaStreamDefault)); + if (barrier_) { + barrier_->wait(); } -#endif } template -void P2PSync::on_gradients_ready() { -#ifndef CPU_ONLY +void NCCL::run(int layer) { + CHECK(solver_->param().layer_wise_reduce()); + vector > >& blobs = + solver_->net()->layers()[layer]->blobs(); #ifdef DEBUG - int device; - CUDA_CHECK(cudaGetDevice(&device)); - CHECK(device == solver_->param().device_id()); + // Assert blobs are contiguous to reduce in one step (e.g. bias often small) + for (int i = 1; i < blobs.size(); ++i) { + CHECK_EQ(blobs[i - 1]->gpu_diff() + blobs[i - 1]->count(), + blobs[i + 0]->gpu_diff()); + } #endif + if (blobs.size() > 0) { + // Make sure default stream is done computing gradients. Could be + // replaced by cudaEventRecord+cudaStreamWaitEvent to avoid + // blocking the default stream, but it's actually slower. 
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - // Sum children gradients as they appear in the queue - for (int i = 0; i < children_.size(); ++i) { - P2PSync *child = queue_.pop(); - Dtype* src = child->parent_grads_; - Dtype* dst = diff_; - -#ifdef DEBUG - bool ok = false; - for (int j = 0; j < children_.size(); ++j) { - if (child == children_[j]) { - ok = true; - } + // Reduce asynchronously + int size = 0; + for (int i = 0; i < blobs.size(); ++i) { + size += blobs[i]->count(); } - CHECK(ok); - cudaPointerAttributes attributes; - CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); - CHECK(attributes.device == device); - CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); - CHECK(attributes.device == device); -#endif - - caffe_gpu_add(size_, src, dst, dst); + if (barrier_) { // NULL in multi process case + barrier_->wait(); + } + NCCL_CHECK(ncclAllReduce(blobs[0]->mutable_gpu_diff(), + blobs[0]->mutable_gpu_diff(), + size, + nccl::dataType::type, + ncclSum, comm_, stream_)); + caffe_gpu_scal(size, (Dtype) 1.0 / Caffe::solver_count(), + blobs[0]->mutable_gpu_diff(), stream_); } +} - // Send gradients to parent - if (parent_) { - Dtype* src = diff_; - Dtype* dst = parent_grads_; - -#ifdef DEBUG - cudaPointerAttributes attributes; - CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); - CHECK(attributes.device == device); - CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); - CHECK(attributes.device == parent_->solver_->param().device_id()); -#endif - - CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype), // - cudaMemcpyDeviceToDevice, cudaStreamDefault)); - CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - parent_->queue_.push(this); +template +void NCCL::on_gradients_ready() { + if (solver_->param().layer_wise_reduce()) { + CHECK_EQ(solver_->net()->params().size(), + solver_->net()->learnable_params().size()) + << "Layer-wise reduce is not supported for nets with shared weights."; + + // Make sure reduction is done before 
applying gradients + CUDA_CHECK(cudaStreamSynchronize(stream_)); } else { - // Loss functions divide gradients by the batch size, so to compensate - // for split batch, the root solver divides by number of solvers. - caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_); + if (barrier_) { // NULL in multi process case + barrier_->wait(); + } + NCCL_CHECK(ncclAllReduce(diff_, diff_, static_cast(size_), + nccl::dataType::type, ncclSum, comm_, + cudaStreamDefault)); + caffe_gpu_scal(static_cast(size_), + (Dtype) 1.0 / Caffe::solver_count(), diff_); } -#endif } template -void P2PSync::Prepare(const vector& gpus, - vector > >* syncs) { - // Pair devices for map-reduce synchronization - vector pairs; - DevicePair::compute(gpus, &pairs); - ostringstream s; - for (int i = 1; i < pairs.size(); ++i) { - s << (i == 1 ? "" : ", ") << pairs[i].parent() << ":" << pairs[i].device(); +class Worker : public InternalThread { + public: + explicit Worker(shared_ptr > rank0, int device, + boost::barrier* barrier, vector*>* nccls, + const char* restore) + : rank0_(rank0), device_(device), barrier_(barrier), + nccls_(nccls), restore_(restore) { } - LOG(INFO)<< "GPUs pairs " << s.str(); - - SolverParameter param(solver_->param()); - - // Build the GPU tree by finding the parent for each solver - for (int attempts = 0; attempts < pairs.size(); ++attempts) { - for (int i = 1; i < pairs.size(); ++i) { - if (!syncs->at(i).get()) { - P2PSync* parent = NULL; - for (int j = 0; j < syncs->size(); ++j) { - P2PSync* sync = j == 0 ? 
this : syncs->at(j).get(); - if (sync) { - const SolverParameter& p = sync->solver()->param(); - if (p.device_id() == pairs[i].parent()) { - parent = sync; - } - } - } - if (parent) { - param.set_device_id(pairs[i].device()); - syncs->at(i).reset(new P2PSync(solver_, parent, param)); - parent->children_.push_back((P2PSync*) syncs->at(i).get()); - } + virtual ~Worker() {} + + protected: + void InternalThreadEntry() { + // Create solver and install callbacks + SolverParameter param(rank0_->param()); + param.set_device_id(device_); +#ifdef DEBUG + int device; + CUDA_CHECK(cudaGetDevice(&device)); + CHECK_EQ(device, device_); +#endif + param.set_type(rank0_->type()); + shared_ptr > s(SolverRegistry::CreateSolver(param)); + CHECK_EQ(s->type(), rank0_->type()); + if (restore_) { + // Could not make NCCL broadcast solver state, it seems to crash + // if called in a tight loop, regardless of barriers etc. so + // restore all solvers from file. + s->Restore(restore_); + } + NCCL nccl(s); + nccl.set_barrier(barrier_); + s->add_callback(&nccl); + if (s->param().layer_wise_reduce()) { + s->net()->add_after_backward(&nccl); + } + (*nccls_)[Caffe::solver_rank()] = &nccl; + // Wait for other threads + barrier_->wait(); + // Wait for NCCL init + barrier_->wait(); + // Broadcast rank 0 state + nccl.Broadcast(); + // Solve + s->Step(param.max_iter() - s->iter()); + barrier_->wait(); +#ifdef DEBUG + // Check all solvers have same state + SGDSolver* sa = static_cast*>(rank0_.get()); + SGDSolver* sb = static_cast*>(s.get()); + for (int h = 0; h < sa->history().size(); ++h) { + CUDA_CHECK(cudaSetDevice(sa->param().device_id())); + const Dtype* a = sa->history()[h]->cpu_data(); + CUDA_CHECK(cudaSetDevice(sb->param().device_id())); + const Dtype* b = sb->history()[h]->cpu_data(); + for (int v = 0; v < sa->history()[h]->count(); ++v) { + CHECK_DOUBLE_EQ(a[v], b[v]); } } +#endif } -} - -template -void P2PSync::Run(const vector& gpus) { - vector > > syncs(gpus.size()); - Prepare(gpus, 
&syncs); - LOG(INFO)<< "Starting Optimization"; + shared_ptr > rank0_; + int device_; + boost::barrier* barrier_; + vector*>* nccls_; + const char* restore_; +}; - for (int i = 1; i < syncs.size(); ++i) { - syncs[i]->StartInternalThread(); +template +void NCCL::Run(const vector& gpus, const char* restore) { + boost::barrier barrier(static_cast(gpus.size())); + vector*> nccls(gpus.size()); + // Create workers + vector > > workers(gpus.size()); + for (int i = 1; i < gpus.size(); ++i) { + CUDA_CHECK(cudaSetDevice(gpus[i])); + Caffe::set_solver_rank(i); + Worker* w = new Worker(solver_, gpus[i], &barrier, + &nccls, restore); + w->StartInternalThread(); + workers[i].reset(w); } - - // Run root solver on current thread + CUDA_CHECK(cudaSetDevice(gpus[0])); + Caffe::set_solver_rank(0); + barrier_ = &barrier; + solver_->add_callback(this); + if (solver_->param().layer_wise_reduce()) { + solver_->net()->add_after_backward(this); + } + nccls[0] = this; + // Wait for workers + barrier.wait(); + // Init NCCL + InitSingleProcess(&nccls); + barrier.wait(); + // Run first solver on current thread + Broadcast(); solver_->Solve(); - - for (int i = 1; i < syncs.size(); ++i) { - syncs[i]->StopInternalThread(); + barrier.wait(); // Hangs without it when running tests + // Wait for shutdown + for (int i = 1; i < gpus.size(); ++i) { + workers[i]->StopInternalThread(); } } INSTANTIATE_CLASS(Params); INSTANTIATE_CLASS(GPUParams); -INSTANTIATE_CLASS(P2PSync); +INSTANTIATE_CLASS(Worker); +INSTANTIATE_CLASS(NCCL); } // namespace caffe + +#endif // USE_NCCL diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 6900bb71482..614f16c240e 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -98,7 +98,7 @@ message NetParameter { // NOTE // Update the next available ID when you add a new SolverParameter field. 
// -// SolverParameter next available ID: 41 (last added: type) +// SolverParameter next available ID: 42 (last added: layer_wise_reduce) message SolverParameter { ////////////////////////////////////////////////////////////////////////////// // Specifying the train and test networks @@ -128,8 +128,7 @@ message SolverParameter { // The states for the train/test nets. Must be unspecified or // specified once per net. // - // By default, all states will have solver = true; - // train_state will have phase = TRAIN, + // By default, train_state will have phase = TRAIN, // and all test_state's will have phase = TEST. // Other defaults are set according to the NetState defaults. optional NetState train_state = 26; @@ -219,7 +218,7 @@ message SolverParameter { // RMSProp decay value // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) - optional float rms_decay = 38; + optional float rms_decay = 38 [default = 0.99]; // If true, print information about the state of the net that may help with // debugging learning problems. @@ -239,6 +238,9 @@ message SolverParameter { } // DEPRECATED: use type instead of solver_type optional SolverType solver_type = 30 [default = SGD]; + + // Overlap compute and communication for data parallel training + optional bool layer_wise_reduce = 41 [default = true]; } // A message that stores the solver snapshots @@ -306,7 +308,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 145 (last added: crop_param) +// LayerParameter next available layer-specific ID: 147 (last added: recurrent_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -328,7 +330,12 @@ message LayerParameter { // The blobs containing the numeric parameters of the layer. repeated BlobProto blobs = 7; - // Specifies on which bottoms the backpropagation should be skipped. 
+ // Specifies whether to backpropagate to each bottom. If unspecified, + // Caffe will automatically infer whether each input needs backpropagation + // to compute parameter gradients. If set to true for some inputs, + // backpropagation to those inputs is forced; if set false for some inputs, + // backpropagation to those inputs is skipped. + // // The size must be either 0 or equal to the number of bottoms. repeated bool propagate_down = 11; @@ -380,10 +387,12 @@ message LayerParameter { optional LRNParameter lrn_param = 118; optional MemoryDataParameter memory_data_param = 119; optional MVNParameter mvn_param = 120; + optional ParameterParameter parameter_param = 145; optional PoolingParameter pooling_param = 121; optional PowerParameter power_param = 122; optional PReLUParameter prelu_param = 131; optional PythonParameter python_param = 130; + optional RecurrentParameter recurrent_param = 146; optional ReductionParameter reduction_param = 136; optional ReLUParameter relu_param = 123; optional ReshapeParameter reshape_param = 133; @@ -411,7 +420,7 @@ message TransformationParameter { optional uint32 crop_size = 3 [default = 0]; // mean_file and mean_value cannot be specified at the same time optional string mean_file = 4; - // if specified can be repeated once (would substract it from all the channels) + // if specified can be repeated once (would subtract it from all the channels) // or can be repeated the same number of times as channels // (would subtract them from the corresponding channel) repeated float mean_value = 5; @@ -427,7 +436,7 @@ message LossParameter { optional int32 ignore_label = 1; // How to normalize the loss for loss layers that aggregate across batches, // spatial dimensions, or other dimensions. Currently only implemented in - // SoftmaxWithLoss layer. + // SoftmaxWithLoss and SigmoidCrossEntropyLoss layers. enum NormalizationMode { // Divide by the number of examples in the batch times spatial dimensions. 
// Outputs that receive the ignore label will NOT be ignored in computing @@ -441,6 +450,8 @@ message LossParameter { // Do not normalize the loss. NONE = 3; } + // For historical reasons, the default normalization for + // SigmoidCrossEntropyLoss is BATCH_SIZE and *not* VALID. optional NormalizationMode normalization = 3 [default = VALID]; // Deprecated. Ignored if normalization is specified. If normalization // is not specified, then setting this to false will be equivalent to @@ -491,11 +502,21 @@ message ConcatParameter { } message BatchNormParameter { - // If false, accumulate global mean/variance values via a moving average. If - // true, use those accumulated values instead of computing mean/variance - // across the batch. + // If false, normalization is performed over the current mini-batch + // and global statistics are accumulated (but not yet used) by a moving + // average. + // If true, those accumulated mean and variance values are used for the + // normalization. + // By default, it is set to false when the network is in the training + // phase and true when the network is in the testing phase. optional bool use_global_stats = 1; - // How much does the moving average decay each iteration? + // What fraction of the moving average remains each iteration? + // Smaller values make the moving average decay faster, giving more + // weight to the recent values. + // Each iteration updates the moving average @f$S_{t-1}@f$ with the + // current mean @f$ Y_t @f$ by + // @f$ S_t = (1-\beta)Y_t + \beta \cdot S_{t-1} @f$, where @f$ \beta @f$ + // is the moving_average_fraction parameter. optional float moving_average_fraction = 2 [default = .999]; // Small value to add to the variance estimate so that we don't divide by // zero. @@ -633,6 +654,7 @@ message DataParameter { // DEPRECATED. Each solver accesses a different subset of the database. optional uint32 rand_skip = 7 [default = 0]; optional DB backend = 8 [default = LEVELDB]; + // DEPRECATED. 
See TransformationParameter. For data pre-processing, we can do // simple scaling and subtracting the data mean, if provided. Note that the // mean subtraction is always carried out before scaling. @@ -646,8 +668,8 @@ message DataParameter { optional bool mirror = 6 [default = false]; // Force the encoded image to have 3 color channels optional bool force_encoded_color = 9 [default = false]; - // Prefetch queue (Number of batches to prefetch to host memory, increase if - // data access bandwidth varies). + // Prefetch queue (Increase if data feeding bandwidth varies, within the + // limit of device memory for GPU training) optional uint32 prefetch = 10 [default = 4]; } @@ -794,6 +816,7 @@ message ImageDataParameter { message InfogainLossParameter { // Specify the infogain matrix source. optional string source = 1; + optional int32 axis = 2 [default = 1]; // axis of prob } message InnerProductParameter { @@ -848,6 +871,13 @@ message LRNParameter { CUDNN = 2; } optional Engine engine = 6 [default = DEFAULT]; + + // Whether or not to skip the LRN layer during a deconv pass. If + // this is true, activations in a deconv will pass through the LRN + // layer unaffected. If it is false, deconv activations will be + // affected by LRN layers the same as backprop diffs are (will pass + // through the derivative of the layer). + optional bool deconv_ignore = 7 [default = false]; } message MemoryDataParameter { @@ -868,6 +898,10 @@ message MVNParameter { optional float eps = 3 [default = 1e-9]; } +message ParameterParameter { + optional BlobShape shape = 1; +} + message PoolingParameter { enum PoolMethod { MAX = 0; @@ -912,12 +946,29 @@ message PythonParameter { // string, dictionary in Python dict format, JSON, etc. You may parse this // string in `setup` method and use it in `forward` and `backward`. optional string param_str = 3 [default = '']; - // Whether this PythonLayer is shared among worker solvers during data parallelism. 
- // If true, each worker solver sequentially run forward from this layer. - // This value should be set true if you are using it as a data layer. + // DEPRECATED optional bool share_in_parallel = 4 [default = false]; } +// Message that stores parameters used by RecurrentLayer +message RecurrentParameter { + // The dimension of the output (and usually hidden state) representation -- + // must be explicitly set to non-zero. + optional uint32 num_output = 1 [default = 0]; + + optional FillerParameter weight_filler = 2; // The filler for the weight + optional FillerParameter bias_filler = 3; // The filler for the bias + + // Whether to enable displaying debug_info in the unrolled recurrent net. + optional bool debug_info = 4 [default = false]; + + // Whether to add as additional inputs (bottoms) the initial hidden state + // blobs, and add as additional outputs (tops) the final timestep hidden state + // blobs. The number of additional bottom/top blobs required depends on the + // recurrent architecture -- e.g., 1 for RNNs, 2 for LSTMs. + optional bool expose_hidden = 5 [default = false]; +} + // Message that stores parameters used by ReductionLayer message ReductionParameter { enum ReductionOp { @@ -982,7 +1033,7 @@ message ReshapeParameter { // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } // reshape_param { shape { dim: 0 dim: 2 dim: 4 } } // reshape_param { shape { dim: 0 dim: 2 dim: -1 } } - // reshape_param { shape { dim: -1 dim: 0 dim: 2 } } + // reshape_param { shape { dim: 0 dim:-1 dim: 4 } } // optional BlobShape shape = 1; @@ -1364,6 +1415,6 @@ message PReLUParameter { // Initial value of a_i. Default is a_i=0.25 for all i. optional FillerParameter filler = 1; - // Whether or not slope paramters are shared across channels. + // Whether or not slope parameters are shared across channels. 
optional bool channel_shared = 2 [default = false]; } diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index ece3913e88a..044269371ad 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -26,16 +26,14 @@ SolverAction::Enum Solver::GetRequestedAction() { } template -Solver::Solver(const SolverParameter& param, const Solver* root_solver) - : net_(), callbacks_(), root_solver_(root_solver), - requested_early_exit_(false) { +Solver::Solver(const SolverParameter& param) + : net_(), callbacks_(), requested_early_exit_(false) { Init(param); } template -Solver::Solver(const string& param_file, const Solver* root_solver) - : net_(), callbacks_(), root_solver_(root_solver), - requested_early_exit_(false) { +Solver::Solver(const string& param_file) + : net_(), callbacks_(), requested_early_exit_(false) { SolverParameter param; ReadSolverParamsFromTextFileOrDie(param_file, ¶m); Init(param); @@ -43,20 +41,18 @@ Solver::Solver(const string& param_file, const Solver* root_solver) template void Solver::Init(const SolverParameter& param) { - CHECK(Caffe::root_solver() || root_solver_) - << "root_solver_ needs to be set for all non-root solvers"; LOG_IF(INFO, Caffe::root_solver()) << "Initializing solver from parameters: " << std::endl << param.DebugString(); param_ = param; CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; CheckSnapshotWritePermissions(); - if (Caffe::root_solver() && param_.random_seed() >= 0) { - Caffe::set_random_seed(param_.random_seed()); + if (param_.random_seed() >= 0) { + Caffe::set_random_seed(param_.random_seed() + Caffe::solver_rank()); } // Scaffolding code InitTrainNet(); + InitTestNets(); if (Caffe::root_solver()) { - InitTestNets(); LOG(INFO) << "Solver scaffolding done."; } iter_ = 0; @@ -101,16 +97,11 @@ void Solver::InitTrainNet() { net_state.MergeFrom(net_param.state()); net_state.MergeFrom(param_.train_state()); net_param.mutable_state()->CopyFrom(net_state); - if (Caffe::root_solver()) { - 
net_.reset(new Net(net_param)); - } else { - net_.reset(new Net(net_param, root_solver_->net_.get())); - } + net_.reset(new Net(net_param)); } template void Solver::InitTestNets() { - CHECK(Caffe::root_solver()); const bool has_net_param = param_.has_net_param(); const bool has_net_file = param_.has_net(); const int num_generic_nets = has_net_param + has_net_file; @@ -180,12 +171,7 @@ void Solver::InitTestNets() { net_params[i].mutable_state()->CopyFrom(net_state); LOG(INFO) << "Creating test net (#" << i << ") specified by " << sources[i]; - if (Caffe::root_solver()) { - test_nets_[i].reset(new Net(net_params[i])); - } else { - test_nets_[i].reset(new Net(net_params[i], - root_solver_->test_nets_[i].get())); - } + test_nets_[i].reset(new Net(net_params[i])); test_nets_[i]->set_debug_info(param_.debug_info()); } } @@ -197,14 +183,16 @@ void Solver::Step(int iters) { int average_loss = this->param_.average_loss(); losses_.clear(); smoothed_loss_ = 0; + iteration_timer_.Start(); while (iter_ < stop_iter) { // zero-init the params net_->ClearParamDiffs(); if (param_.test_interval() && iter_ % param_.test_interval() == 0 - && (iter_ > 0 || param_.test_initialization()) - && Caffe::root_solver()) { - TestAll(); + && (iter_ > 0 || param_.test_initialization())) { + if (Caffe::root_solver()) { + TestAll(); + } if (requested_early_exit_) { // Break out of the while loop because stop was requested while testing. break; @@ -225,8 +213,13 @@ void Solver::Step(int iters) { // average the loss across iterations for smoothed reporting UpdateSmoothedLoss(loss, start_iter, average_loss); if (display) { + float lapse = iteration_timer_.Seconds(); + float per_s = (iter_ - iterations_last_) / (lapse ? 
lapse : 1); LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_ - << ", loss = " << smoothed_loss_; + << " (" << per_s << " iter/s, " << lapse << "s/" + << param_.display() << " iters), loss = " << smoothed_loss_; + iteration_timer_.Start(); + iterations_last_ = iter_; const vector*>& result = net_->output_blobs(); int score_index = 0; for (int j = 0; j < result.size(); ++j) { @@ -468,7 +461,6 @@ string Solver::SnapshotToHDF5() { template void Solver::Restore(const char* state_file) { - CHECK(Caffe::root_solver()); string state_filename(state_file); if (state_filename.size() >= 3 && state_filename.compare(state_filename.size() - 3, 3, ".h5") == 0) { diff --git a/src/caffe/solvers/adagrad_solver.cpp b/src/caffe/solvers/adagrad_solver.cpp index e78eadca141..d8107e1e623 100644 --- a/src/caffe/solvers/adagrad_solver.cpp +++ b/src/caffe/solvers/adagrad_solver.cpp @@ -12,7 +12,6 @@ void adagrad_update_gpu(int N, Dtype* g, Dtype* h, Dtype delta, template void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { - CHECK(Caffe::root_solver()); const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); Dtype delta = this->param_.delta(); diff --git a/src/caffe/solvers/nesterov_solver.cpp b/src/caffe/solvers/nesterov_solver.cpp index 23ab2d4369a..7c1fac1f884 100644 --- a/src/caffe/solvers/nesterov_solver.cpp +++ b/src/caffe/solvers/nesterov_solver.cpp @@ -12,7 +12,6 @@ void nesterov_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum, template void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { - CHECK(Caffe::root_solver()); const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); Dtype momentum = this->param_.momentum(); diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index f30f316d1a0..ad6abe54a0a 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -100,10 
+100,10 @@ void SGDSolver::ClipGradients() { template void SGDSolver::ApplyUpdate() { - CHECK(Caffe::root_solver()); Dtype rate = GetLearningRate(); if (this->param_.display() && this->iter_ % this->param_.display() == 0) { - LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; + LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << this->iter_ + << ", lr = " << rate; } ClipGradients(); for (int param_id = 0; param_id < this->net_->learnable_params().size(); diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 4d3564172ab..88d9b78510a 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -3,26 +3,41 @@ #include "caffe/util/math_functions.hpp" namespace caffe { +SyncedMemory::SyncedMemory() + : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), + own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false) { +#ifndef CPU_ONLY +#ifdef DEBUG + CUDA_CHECK(cudaGetDevice(&device_)); +#endif +#endif +} + +SyncedMemory::SyncedMemory(size_t size) + : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), + own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false) { +#ifndef CPU_ONLY +#ifdef DEBUG + CUDA_CHECK(cudaGetDevice(&device_)); +#endif +#endif +} SyncedMemory::~SyncedMemory() { + check_device(); if (cpu_ptr_ && own_cpu_data_) { CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_); } #ifndef CPU_ONLY if (gpu_ptr_ && own_gpu_data_) { - int initial_device; - cudaGetDevice(&initial_device); - if (gpu_device_ != -1) { - CUDA_CHECK(cudaSetDevice(gpu_device_)); - } CUDA_CHECK(cudaFree(gpu_ptr_)); - cudaSetDevice(initial_device); } #endif // CPU_ONLY } inline void SyncedMemory::to_cpu() { + check_device(); switch (head_) { case UNINITIALIZED: CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_); @@ -49,10 +64,10 @@ inline void SyncedMemory::to_cpu() { } inline void SyncedMemory::to_gpu() { + check_device(); #ifndef CPU_ONLY switch (head_) { case UNINITIALIZED: - 
CUDA_CHECK(cudaGetDevice(&gpu_device_)); CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); caffe_gpu_memset(size_, 0, gpu_ptr_); head_ = HEAD_AT_GPU; @@ -60,7 +75,6 @@ inline void SyncedMemory::to_gpu() { break; case HEAD_AT_CPU: if (gpu_ptr_ == NULL) { - CUDA_CHECK(cudaGetDevice(&gpu_device_)); CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); own_gpu_data_ = true; } @@ -77,11 +91,13 @@ inline void SyncedMemory::to_gpu() { } const void* SyncedMemory::cpu_data() { + check_device(); to_cpu(); return (const void*)cpu_ptr_; } void SyncedMemory::set_cpu_data(void* data) { + check_device(); CHECK(data); if (own_cpu_data_) { CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_); @@ -92,6 +108,7 @@ void SyncedMemory::set_cpu_data(void* data) { } const void* SyncedMemory::gpu_data() { + check_device(); #ifndef CPU_ONLY to_gpu(); return (const void*)gpu_ptr_; @@ -102,16 +119,11 @@ const void* SyncedMemory::gpu_data() { } void SyncedMemory::set_gpu_data(void* data) { + check_device(); #ifndef CPU_ONLY CHECK(data); if (own_gpu_data_) { - int initial_device; - cudaGetDevice(&initial_device); - if (gpu_device_ != -1) { - CUDA_CHECK(cudaSetDevice(gpu_device_)); - } CUDA_CHECK(cudaFree(gpu_ptr_)); - cudaSetDevice(initial_device); } gpu_ptr_ = data; head_ = HEAD_AT_GPU; @@ -122,12 +134,14 @@ void SyncedMemory::set_gpu_data(void* data) { } void* SyncedMemory::mutable_cpu_data() { + check_device(); to_cpu(); head_ = HEAD_AT_CPU; return cpu_ptr_; } void* SyncedMemory::mutable_gpu_data() { + check_device(); #ifndef CPU_ONLY to_gpu(); head_ = HEAD_AT_GPU; @@ -140,9 +154,9 @@ void* SyncedMemory::mutable_gpu_data() { #ifndef CPU_ONLY void SyncedMemory::async_gpu_push(const cudaStream_t& stream) { + check_device(); CHECK(head_ == HEAD_AT_CPU); if (gpu_ptr_ == NULL) { - CUDA_CHECK(cudaGetDevice(&gpu_device_)); CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); own_gpu_data_ = true; } @@ -153,5 +167,20 @@ void SyncedMemory::async_gpu_push(const cudaStream_t& stream) { } #endif +void SyncedMemory::check_device() { +#ifndef 
CPU_ONLY +#ifdef DEBUG + int device; + cudaGetDevice(&device); + CHECK(device == device_); + if (gpu_ptr_ && own_gpu_data_) { + cudaPointerAttributes attributes; + CUDA_CHECK(cudaPointerGetAttributes(&attributes, gpu_ptr_)); + CHECK(attributes.device == device_); + } +#endif +#endif +} + } // namespace caffe diff --git a/src/caffe/test/CMakeLists.txt b/src/caffe/test/CMakeLists.txt index 35a803f2f41..d8afc30b76b 100644 --- a/src/caffe/test/CMakeLists.txt +++ b/src/caffe/test/CMakeLists.txt @@ -1,7 +1,7 @@ # The option allows to include in build only selected test files and exclude all others # Usage example: # cmake -DBUILD_only_tests="common,net,blob,im2col_kernel" -set(BUILD_only_tests "" CACHE STRING "Blank or comma-separated list of test files to build without 'test_' prefix and extention") +set(BUILD_only_tests "" CACHE STRING "Blank or comma-separated list of test files to build without 'test_' prefix and extension") caffe_leave_only_selected_tests(test_srcs ${BUILD_only_tests}) caffe_leave_only_selected_tests(test_cuda ${BUILD_only_tests}) diff --git a/src/caffe/test/test_blob.cpp b/src/caffe/test/test_blob.cpp index a9d7d519e45..b88562223d0 100644 --- a/src/caffe/test/test_blob.cpp +++ b/src/caffe/test/test_blob.cpp @@ -51,6 +51,14 @@ TYPED_TEST(BlobSimpleTest, TestReshape) { EXPECT_EQ(this->blob_->count(), 120); } +TYPED_TEST(BlobSimpleTest, TestReshapeZero) { + vector shape(2); + shape[0] = 0; + shape[1] = 5; + this->blob_->Reshape(shape); + EXPECT_EQ(this->blob_->count(), 0); +} + TYPED_TEST(BlobSimpleTest, TestLegacyBlobProtoShapeEquals) { BlobProto blob_proto; diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp index fccf6f1613b..8f333bd7105 100644 --- a/src/caffe/test/test_caffe_main.cpp +++ b/src/caffe/test/test_caffe_main.cpp @@ -1,6 +1,3 @@ -// The main caffe test code. Your test cpp code should include this hpp -// to allow a main function to be compiled into the binary. 
- #include "caffe/caffe.hpp" #include "caffe/test/test_caffe_main.hpp" @@ -18,7 +15,7 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); caffe::GlobalInit(&argc, &argv); #ifndef CPU_ONLY - // Before starting testing, let's first print out a few cuda defice info. + // Before starting testing, let's first print out a few cuda device info. int device; cudaGetDeviceCount(&device); cout << "Cuda number of devices: " << device << endl; diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp index 9bb19d13592..85c10a29483 100644 --- a/src/caffe/test/test_convolution_layer.cpp +++ b/src/caffe/test/test_convolution_layer.cpp @@ -695,7 +695,7 @@ TYPED_TEST(ConvolutionLayerTest, TestNDAgainst2D) { } ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count()); for (int i = 0; i < backward_result_2d.count(); ++i) { - EXPECT_EQ(backward_result_2d.cpu_diff()[i], + EXPECT_FLOAT_EQ(backward_result_2d.cpu_diff()[i], backward_result_nd.cpu_diff()[i]); } ASSERT_EQ(backward_weight_result_nd.count(), diff --git a/src/caffe/test/test_crop_layer.cpp b/src/caffe/test/test_crop_layer.cpp index 45f24e2ee8d..ce2c736f644 100644 --- a/src/caffe/test/test_crop_layer.cpp +++ b/src/caffe/test/test_crop_layer.cpp @@ -91,6 +91,24 @@ TYPED_TEST(CropLayerTest, TestSetupShapeNegativeIndexing) { } } +TYPED_TEST(CropLayerTest, TestDimensionsCheck) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + // Reshape size blob to have incompatible sizes for uncropped dimensions: + // the size blob has more channels than the data blob, but this is fine + // since the channels dimension is not cropped in this configuration. 
+ this->blob_bottom_1_->Reshape(2, 5, 4, 2); + CropLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < this->blob_top_->num_axes(); ++i) { + if (i < 2) { + EXPECT_EQ(this->blob_bottom_0_->shape(i), this->blob_top_->shape(i)); + } else { + EXPECT_EQ(this->blob_bottom_1_->shape(i), this->blob_top_->shape(i)); + } + } +} + TYPED_TEST(CropLayerTest, TestCropAll) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; diff --git a/src/caffe/test/test_data_layer.cpp b/src/caffe/test/test_data_layer.cpp index 3e8d113d918..3835af1f173 100644 --- a/src/caffe/test/test_data_layer.cpp +++ b/src/caffe/test/test_data_layer.cpp @@ -105,6 +105,32 @@ class DataLayerTest : public MultiDeviceTest { } } + void TestSkip() { + LayerParameter param; + param.set_phase(TRAIN); + DataParameter* data_param = param.mutable_data_param(); + int batch_size = 5; + data_param->set_batch_size(batch_size); + data_param->set_source(filename_->c_str()); + data_param->set_backend(backend_); + Caffe::set_solver_count(8); + for (int dev = 0; dev < Caffe::solver_count(); ++dev) { + Caffe::set_solver_rank(dev); + DataLayer layer(param); + layer.SetUp(blob_bottom_vec_, blob_top_vec_); + int label = dev; + for (int iter = 0; iter < 10; ++iter) { + layer.Forward(blob_bottom_vec_, blob_top_vec_); + for (int i = 0; i < batch_size; ++i) { + EXPECT_EQ(label % batch_size, blob_top_label_->cpu_data()[i]); + label += Caffe::solver_count(); + } + } + } + Caffe::set_solver_count(1); + Caffe::set_solver_rank(0); + } + void TestReshape(DataParameter_DB backend) { const int num_inputs = 5; // Save data of varying shapes. 
@@ -356,6 +382,11 @@ TYPED_TEST(DataLayerTest, TestReadLevelDB) { this->TestRead(); } +TYPED_TEST(DataLayerTest, TestSkipLevelDB) { + this->Fill(false, DataParameter_DB_LEVELDB); + this->TestSkip(); +} + TYPED_TEST(DataLayerTest, TestReshapeLevelDB) { this->TestReshape(DataParameter_DB_LEVELDB); } @@ -396,6 +427,11 @@ TYPED_TEST(DataLayerTest, TestReadLMDB) { this->TestRead(); } +TYPED_TEST(DataLayerTest, TestSkipLMDB) { + this->Fill(false, DataParameter_DB_LMDB); + this->TestSkip(); +} + TYPED_TEST(DataLayerTest, TestReshapeLMDB) { this->TestReshape(DataParameter_DB_LMDB); } diff --git a/src/caffe/test/test_embed_layer.cpp b/src/caffe/test/test_embed_layer.cpp index dc7f5c4aa47..13f13a878d3 100644 --- a/src/caffe/test/test_embed_layer.cpp +++ b/src/caffe/test/test_embed_layer.cpp @@ -124,7 +124,7 @@ TYPED_TEST(EmbedLayerTest, TestForwardWithBias) { top_offset[4] = 0; bias_offset[0] = 0; for (int j = 0; j < kNumOutput; ++j) { - EXPECT_EQ(layer->blobs()[0]->data_at(weight_offset) + + EXPECT_FLOAT_EQ(layer->blobs()[0]->data_at(weight_offset) + layer->blobs()[1]->data_at(bias_offset), this->blob_top_->data_at(top_offset)); ++top_offset[4]; diff --git a/src/caffe/test/test_euclidean_loss_layer.cpp b/src/caffe/test/test_euclidean_loss_layer.cpp index f253f9fd393..b026f5b2077 100644 --- a/src/caffe/test/test_euclidean_loss_layer.cpp +++ b/src/caffe/test/test_euclidean_loss_layer.cpp @@ -39,7 +39,7 @@ class EuclideanLossLayerTest : public MultiDeviceTest { void TestForward() { // Get the loss without a specified objective weight -- should be - // equivalent to explicitly specifiying a weight of 1. + // equivalent to explicitly specifying a weight of 1. 
LayerParameter layer_param; EuclideanLossLayer layer_weight_1(layer_param); layer_weight_1.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp index 975a8f0f88a..f4395f5311c 100644 --- a/src/caffe/test/test_gradient_based_solver.cpp +++ b/src/caffe/test/test_gradient_based_solver.cpp @@ -28,7 +28,7 @@ class GradientBasedSolverTest : public MultiDeviceTest { seed_(1701), num_(4), channels_(3), height_(10), width_(10), share_(false) { input_file_ = new string( - CMAKE_SOURCE_DIR "caffe/test/test_data/solver_data_list.txt" CMAKE_EXT); + ABS_TEST_DATA_DIR "/solver_data_list.txt"); } ~GradientBasedSolverTest() { delete input_file_; @@ -36,7 +36,9 @@ class GradientBasedSolverTest : public MultiDeviceTest { string snapshot_prefix_; shared_ptr > solver_; - shared_ptr > sync_; +#ifdef USE_NCCL + shared_ptr > nccl_; +#endif int seed_; // Dimensions are determined by generate_sample_data.py // TODO this is brittle and the hdf5 file should be checked instead. 
@@ -85,6 +87,7 @@ class GradientBasedSolverTest : public MultiDeviceTest { "lr_policy: 'fixed' " "iter_size: " << iter_size << " " "device_id: " << device_id << " " + "layer_wise_reduce: " << (!share_) << " " "net_param { " " name: 'TestNetwork' " " layer { " @@ -183,7 +186,7 @@ class GradientBasedSolverTest : public MultiDeviceTest { } Caffe::set_random_seed(this->seed_); this->InitSolverFromProtoString(proto.str()); - if (from_snapshot != NULL) { + if (from_snapshot) { this->solver_->Restore(from_snapshot); for (int i = 0; i < this->solver_->iter(); ++i) { this->solver_->net()->Forward(); @@ -202,9 +205,10 @@ class GradientBasedSolverTest : public MultiDeviceTest { gpus.push_back(i); } Caffe::set_solver_count(gpus.size()); - this->sync_.reset(new P2PSync( - this->solver_, NULL, this->solver_->param())); - this->sync_->Run(gpus); +#ifdef USE_NCCL + this->nccl_.reset(new NCCL(this->solver_)); + this->nccl_->Run(gpus, from_snapshot); +#endif Caffe::set_solver_count(1); } if (snapshot) { @@ -457,12 +461,28 @@ class GradientBasedSolverTest : public MultiDeviceTest { const int kIterSize = 1; // Test over all numbers of devices. int available_devices = 1; -#ifndef CPU_ONLY +#ifdef USE_NCCL if (Caffe::mode() == Caffe::GPU) { CUDA_CHECK(cudaGetDeviceCount(&available_devices)); } #endif - for (int devices = 1; devices <= available_devices; ++devices) { + // Takes a while to test all sizes for each test so sparse + vector sizes; + sizes.push_back(1); + if (available_devices >= 2) { + sizes.push_back(2); + } + if (available_devices >= 3) { + sizes.push_back(3); + } + if (available_devices >= 8) { + sizes.push_back(8); + } + if (available_devices >= 16) { + sizes.push_back(16); + } + for (int i = 0; i < sizes.size(); ++i) { + int devices = sizes[i]; // Configure batch size for single / multi device equivalence. // Constant data is needed for multi device as for accumulation. 
num_ = kNum * devices; @@ -538,9 +558,11 @@ class GradientBasedSolverTest : public MultiDeviceTest { const vector*>& params = solver_->net()->learnable_params(); for (int i = 0; i < params.size(); ++i) { for (int j = 0; j < params[i]->count(); ++j) { - EXPECT_EQ(param_copies[i]->cpu_data()[j], params[i]->cpu_data()[j]) + EXPECT_FLOAT_EQ(param_copies[i]->cpu_data()[j], + params[i]->cpu_data()[j]) << "param " << i << " data differed at dim " << j; - EXPECT_EQ(param_copies[i]->cpu_diff()[j], params[i]->cpu_diff()[j]) + EXPECT_FLOAT_EQ(param_copies[i]->cpu_diff()[j], + params[i]->cpu_diff()[j]) << "param " << i << " diff differed at dim " << j; } } @@ -549,9 +571,11 @@ class GradientBasedSolverTest : public MultiDeviceTest { const vector > >& history = solver_->history(); for (int i = 0; i < history.size(); ++i) { for (int j = 0; j < history[i]->count(); ++j) { - EXPECT_EQ(history_copies[i]->cpu_data()[j], history[i]->cpu_data()[j]) + EXPECT_FLOAT_EQ(history_copies[i]->cpu_data()[j], + history[i]->cpu_data()[j]) << "history blob " << i << " data differed at dim " << j; - EXPECT_EQ(history_copies[i]->cpu_diff()[j], history[i]->cpu_diff()[j]) + EXPECT_FLOAT_EQ(history_copies[i]->cpu_diff()[j], + history[i]->cpu_diff()[j]) << "history blob " << i << " diff differed at dim " << j; } } diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp index 3833ebff78e..f94dd57e7de 100644 --- a/src/caffe/test/test_hdf5_output_layer.cpp +++ b/src/caffe/test/test_hdf5_output_layer.cpp @@ -20,8 +20,7 @@ class HDF5OutputLayerTest : public MultiDeviceTest { protected: HDF5OutputLayerTest() - : input_file_name_( - CMAKE_SOURCE_DIR "caffe/test/test_data/sample_data.h5"), + : input_file_name_(ABS_TEST_DATA_DIR "/sample_data.h5"), blob_data_(new Blob()), blob_label_(new Blob()), num_(5), @@ -77,10 +76,12 @@ TYPED_TEST(HDF5OutputLayerTest, TestForward) { H5P_DEFAULT); ASSERT_GE(file_id, 0)<< "Failed to open HDF5 file" << this->input_file_name_; + // 
Allow reshape here as we are loading data not params + bool reshape = true; hdf5_load_nd_dataset(file_id, HDF5_DATA_DATASET_NAME, 0, 4, - this->blob_data_); + this->blob_data_, reshape); hdf5_load_nd_dataset(file_id, HDF5_DATA_LABEL_NAME, 0, 4, - this->blob_label_); + this->blob_label_, reshape); herr_t status = H5Fclose(file_id); EXPECT_GE(status, 0)<< "Failed to close HDF5 file " << this->input_file_name_; @@ -105,12 +106,12 @@ TYPED_TEST(HDF5OutputLayerTest, TestForward) { Blob* blob_data = new Blob(); hdf5_load_nd_dataset(file_id, HDF5_DATA_DATASET_NAME, 0, 4, - blob_data); + blob_data, reshape); this->CheckBlobEqual(*(this->blob_data_), *blob_data); Blob* blob_label = new Blob(); hdf5_load_nd_dataset(file_id, HDF5_DATA_LABEL_NAME, 0, 4, - blob_label); + blob_label, reshape); this->CheckBlobEqual(*(this->blob_label_), *blob_label); status = H5Fclose(file_id); diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp index 8884ce95a23..3977c4866c7 100644 --- a/src/caffe/test/test_hdf5data_layer.cpp +++ b/src/caffe/test/test_hdf5data_layer.cpp @@ -30,8 +30,7 @@ class HDF5DataLayerTest : public MultiDeviceTest { blob_top_vec_.push_back(blob_top_label2_); // Check out generate_sample_data.py in the same directory. - filename = new string( - CMAKE_SOURCE_DIR "caffe/test/test_data/sample_data_list.txt" CMAKE_EXT); + filename = new string(ABS_TEST_DATA_DIR "/sample_data_list.txt"); LOG(INFO)<< "Using sample HDF5 data file " << filename; } @@ -70,7 +69,7 @@ TYPED_TEST(HDF5DataLayerTest, TestRead) { int height = 6; int width = 5; - // Test that the layer setup got the correct parameters. + // Test that the layer setup gives correct parameters. 
HDF5DataLayer layer(param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); EXPECT_EQ(this->blob_top_data_->num(), batch_size); @@ -133,4 +132,34 @@ TYPED_TEST(HDF5DataLayerTest, TestRead) { } } +TYPED_TEST(HDF5DataLayerTest, TestSkip) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter param; + param.add_top("data"); + param.add_top("label"); + + HDF5DataParameter* hdf5_data_param = param.mutable_hdf5_data_param(); + int batch_size = 5; + hdf5_data_param->set_batch_size(batch_size); + hdf5_data_param->set_source(*(this->filename)); + + Caffe::set_solver_count(8); + for (int dev = 0; dev < Caffe::solver_count(); ++dev) { + Caffe::set_solver_rank(dev); + + HDF5DataLayer layer(param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + int label = dev; + for (int iter = 0; iter < 1; ++iter) { + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < batch_size; ++i) { + EXPECT_EQ(1 + label, this->blob_top_label_->cpu_data()[i]); + label = (label + Caffe::solver_count()) % (batch_size * 2); + } + } + } + Caffe::set_solver_count(1); + Caffe::set_solver_rank(0); +} + } // namespace caffe diff --git a/src/caffe/test/test_image_data_layer.cpp b/src/caffe/test/test_image_data_layer.cpp index a4080ccd145..ce5e0bc62d6 100644 --- a/src/caffe/test/test_image_data_layer.cpp +++ b/src/caffe/test/test_image_data_layer.cpp @@ -34,16 +34,24 @@ class ImageDataLayerTest : public MultiDeviceTest { std::ofstream outfile(filename_.c_str(), std::ofstream::out); LOG(INFO) << "Using temporary file " << filename_; for (int i = 0; i < 5; ++i) { - outfile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << i; + outfile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << i << std::endl; } outfile.close(); // Create test input file for images of distinct sizes. 
MakeTempFilename(&filename_reshape_); std::ofstream reshapefile(filename_reshape_.c_str(), std::ofstream::out); LOG(INFO) << "Using temporary file " << filename_reshape_; - reshapefile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << 0; - reshapefile << EXAMPLES_SOURCE_DIR "images/fish-bike.jpg " << 1; + reshapefile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << 0 << std::endl; + reshapefile << EXAMPLES_SOURCE_DIR "images/fish-bike.jpg " << 1 + << std::endl; reshapefile.close(); + // Create test input file for images with space in names + MakeTempFilename(&filename_space_); + std::ofstream spacefile(filename_space_.c_str(), std::ofstream::out); + LOG(INFO) << "Using temporary file " << filename_space_; + spacefile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << 0 << std::endl; + spacefile << EXAMPLES_SOURCE_DIR "images/cat gray.jpg " << 1 << std::endl; + spacefile.close(); } virtual ~ImageDataLayerTest() { @@ -54,6 +62,7 @@ class ImageDataLayerTest : public MultiDeviceTest { int seed_; string filename_; string filename_reshape_; + string filename_space_; Blob* const blob_top_data_; Blob* const blob_top_label_; vector*> blob_bottom_vec_; @@ -177,5 +186,34 @@ TYPED_TEST(ImageDataLayerTest, TestShuffle) { } } +TYPED_TEST(ImageDataLayerTest, TestSpace) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter param; + ImageDataParameter* image_data_param = param.mutable_image_data_param(); + image_data_param->set_batch_size(1); + image_data_param->set_source(this->filename_space_.c_str()); + image_data_param->set_shuffle(false); + ImageDataLayer layer(param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_label_->num(), 1); + EXPECT_EQ(this->blob_top_label_->channels(), 1); + EXPECT_EQ(this->blob_top_label_->height(), 1); + EXPECT_EQ(this->blob_top_label_->width(), 1); + // cat.jpg + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_data_->num(), 1); + EXPECT_EQ(this->blob_top_data_->channels(), 3); + 
EXPECT_EQ(this->blob_top_data_->height(), 360); + EXPECT_EQ(this->blob_top_data_->width(), 480); + EXPECT_EQ(this->blob_top_label_->cpu_data()[0], 0); + // cat gray.jpg + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_data_->num(), 1); + EXPECT_EQ(this->blob_top_data_->channels(), 3); + EXPECT_EQ(this->blob_top_data_->height(), 360); + EXPECT_EQ(this->blob_top_data_->width(), 480); + EXPECT_EQ(this->blob_top_label_->cpu_data()[0], 1); +} + } // namespace caffe #endif // USE_OPENCV diff --git a/src/caffe/test/test_infogain_loss_layer.cpp b/src/caffe/test/test_infogain_loss_layer.cpp index a24ac683dc5..34f21271a62 100644 --- a/src/caffe/test/test_infogain_loss_layer.cpp +++ b/src/caffe/test/test_infogain_loss_layer.cpp @@ -1,3 +1,4 @@ +#include #include #include "gtest/gtest.h" @@ -18,17 +19,22 @@ class InfogainLossLayerTest : public MultiDeviceTest { protected: InfogainLossLayerTest() - : blob_bottom_data_(new Blob(10, 5, 1, 1)), - blob_bottom_label_(new Blob(10, 1, 1, 1)), + : blob_bottom_data_(new Blob(4, 2, 5, 2)), + blob_bottom_label_(new Blob(4, 2, 1, 2)), blob_bottom_infogain_(new Blob(1, 1, 5, 5)), - blob_top_loss_(new Blob()) { + blob_top_loss_(new Blob()), + blob_top_prob_(new Blob()), + inner_(2), outer_(4*2), num_labels_(5) { Caffe::set_random_seed(1701); FillerParameter filler_param; - PositiveUnitballFiller filler(filler_param); + filler_param.set_min(-0.5); + filler_param.set_max(2.0); + UniformFiller filler(filler_param); filler.Fill(this->blob_bottom_data_); blob_bottom_vec_.push_back(blob_bottom_data_); for (int i = 0; i < blob_bottom_label_->count(); ++i) { - blob_bottom_label_->mutable_cpu_data()[i] = caffe_rng_rand() % 5; + blob_bottom_label_->mutable_cpu_data()[i] = + caffe_rng_rand() % num_labels_; } blob_bottom_vec_.push_back(blob_bottom_label_); filler_param.set_min(0.1); @@ -37,29 +43,94 @@ class InfogainLossLayerTest : public MultiDeviceTest { infogain_filler.Fill(this->blob_bottom_infogain_); 
blob_bottom_vec_.push_back(blob_bottom_infogain_); blob_top_vec_.push_back(blob_top_loss_); + blob_top_vec_.push_back(blob_top_prob_); } virtual ~InfogainLossLayerTest() { delete blob_bottom_data_; delete blob_bottom_label_; delete blob_bottom_infogain_; delete blob_top_loss_; + delete blob_top_prob_; } Blob* const blob_bottom_data_; Blob* const blob_bottom_label_; Blob* const blob_bottom_infogain_; Blob* const blob_top_loss_; + Blob* const blob_top_prob_; vector*> blob_bottom_vec_; vector*> blob_top_vec_; + int inner_, outer_, num_labels_; }; TYPED_TEST_CASE(InfogainLossLayerTest, TestDtypesAndDevices); +TYPED_TEST(InfogainLossLayerTest, TestInfogainLoss) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_infogain_loss_param()->set_axis(2); + layer_param.clear_loss_weight(); + layer_param.add_loss_weight(1); + layer_param.add_loss_weight(0); + /*vector* lw = layer_param.mutable_loss_weight(); + lw->clear(); + lw->push_back(1); + lw->push_back(1);*/ + InfogainLossLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Now, check values + const Dtype* data = this->blob_bottom_vec_[0]->cpu_data(); + const Dtype* prob = this->blob_top_vec_[1]->cpu_data(); + const Dtype* labels = this->blob_bottom_vec_[1]->cpu_data(); + const Dtype* H = this->blob_bottom_vec_[2]->cpu_data(); + // first. 
test the prob top + CHECK_EQ(this->blob_bottom_vec_[0]->num_axes(), + this->blob_top_vec_[1]->num_axes()) + << "prob top shape not match bottom data"; + for (int ai = 0 ; ai < this->blob_bottom_vec_[0]->num_axes(); ai++) { + CHECK_EQ(this->blob_bottom_vec_[0]->shape(ai), + this->blob_top_vec_[1]->shape(ai)) + << "prob top shape not match bottom data"; + } + vector est_prob(this->num_labels_, 0); + for ( int i = 0 ; i < this->outer_; i++ ) { + for ( int j = 0; j < this->inner_; j++ ) { + Dtype den = 0; + for ( int l = 0; l < this->num_labels_; l++ ) { + est_prob[l] = std::exp( + data[i*this->num_labels_*this->inner_ + l*this->inner_ + j]); + den += est_prob[l]; + } + for ( int l = 0; l < this->num_labels_; l++ ) { + EXPECT_NEAR(prob[i*this->num_labels_*this->inner_ + l*this->inner_ + j], + est_prob[l]/den, 1e-6); + } + } + } + Dtype loss = 0; // loss from prob top + for ( int i = 0 ; i < this->outer_; i++ ) { + for ( int j = 0; j < this->inner_; j++ ) { + int gt = static_cast(labels[i*this->inner_+j]); + for ( int l = 0; l < this->num_labels_; l++ ) { + loss -= H[gt*this->num_labels_ + l] * + log(std::max( + prob[i*this->num_labels_*this->inner_ + l*this->inner_ + j], + Dtype(kLOG_THRESHOLD))); + } + } + } + EXPECT_NEAR(this->blob_top_loss_->cpu_data()[0], + loss/(this->outer_*this->inner_), 1e-6); +} TYPED_TEST(InfogainLossLayerTest, TestGradient) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; + layer_param.mutable_infogain_loss_param()->set_axis(2); InfogainLossLayer layer(layer_param); - GradientChecker checker(1e-4, 2e-2, 1701, 1, 0.01); + this->blob_top_vec_.clear(); // ignore prob top. 
+ this->blob_top_vec_.push_back(this->blob_top_loss_); + GradientChecker checker(1e-4, 2e-2, 1701); // no "kink" checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, this->blob_top_vec_, 0); } diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp index f1ec2333fae..6d84d292b38 100644 --- a/src/caffe/test/test_inner_product_layer.cpp +++ b/src/caffe/test/test_inner_product_layer.cpp @@ -60,9 +60,9 @@ TYPED_TEST(InnerProductLayerTest, TestSetUp) { EXPECT_EQ(this->blob_top_->channels(), 10); } -/** @brief TestSetUp while toggling tranpose flag +/** @brief TestSetUp while toggling transpose flag */ -TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeFalse) { +TYPED_TEST(InnerProductLayerTest, TestSetUpTransposeFalse) { typedef typename TypeParam::Dtype Dtype; this->blob_bottom_vec_.push_back(this->blob_bottom_); LayerParameter layer_param; @@ -82,9 +82,9 @@ TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeFalse) { EXPECT_EQ(60, layer->blobs()[0]->shape(1)); } -/** @brief TestSetUp while toggling tranpose flag +/** @brief TestSetUp while toggling transpose flag */ -TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeTrue) { +TYPED_TEST(InnerProductLayerTest, TestSetUpTransposeTrue) { typedef typename TypeParam::Dtype Dtype; this->blob_bottom_vec_.push_back(this->blob_bottom_); LayerParameter layer_param; @@ -339,7 +339,7 @@ TYPED_TEST(InnerProductLayerTest, TestBackwardTranspose) { // copy bottom diffs Blob* const bottom_diff = new Blob(); bottom_diff->CopyFrom(*this->blob_bottom_vec_[0], true, true); - // repeat original top with tranposed ip + // repeat original top with transposed ip this->blob_top_vec_.clear(); this->blob_top_vec_.push_back(new Blob()); inner_product_param->set_transpose(true); diff --git a/src/caffe/test/test_lstm_layer.cpp b/src/caffe/test/test_lstm_layer.cpp new file mode 100644 index 00000000000..51905baafac --- /dev/null +++ b/src/caffe/test/test_lstm_layer.cpp @@ -0,0 +1,288 @@ 
+#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/lstm_layer.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class LSTMLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + LSTMLayerTest() : num_output_(7) { + blob_bottom_vec_.push_back(&blob_bottom_); + blob_bottom_vec_.push_back(&blob_bottom_cont_); + blob_top_vec_.push_back(&blob_top_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_c_prev_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_x_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_cont_); + unit_blob_top_vec_.push_back(&unit_blob_top_c_); + unit_blob_top_vec_.push_back(&unit_blob_top_h_); + + ReshapeBlobs(1, 3); + + layer_param_.mutable_recurrent_param()->set_num_output(num_output_); + FillerParameter* weight_filler = + layer_param_.mutable_recurrent_param()->mutable_weight_filler(); + weight_filler->set_type("gaussian"); + weight_filler->set_std(0.2); + FillerParameter* bias_filler = + layer_param_.mutable_recurrent_param()->mutable_bias_filler(); + bias_filler->set_type("gaussian"); + bias_filler->set_std(0.1); + + layer_param_.set_phase(TEST); + } + + void ReshapeBlobs(int num_timesteps, int num_instances) { + blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); + blob_bottom_static_.Reshape(num_instances, 2, 3, 4); + vector shape(2); + shape[0] = num_timesteps; + shape[1] = num_instances; + blob_bottom_cont_.Reshape(shape); + shape.push_back(num_output_); + + shape[0] = 1; shape[1] = num_instances; shape[2] = 4 * num_output_; + unit_blob_bottom_x_.Reshape(shape); + shape[0] = 1; shape[1] = num_instances; shape[2] = num_output_; + unit_blob_bottom_c_prev_.Reshape(shape); + shape.resize(2); + shape[0] = 1; shape[1] = num_instances; + unit_blob_bottom_cont_.Reshape(shape); + + FillerParameter 
filler_param; + filler_param.set_min(-1); + filler_param.set_max(1); + UniformFiller filler(filler_param); + filler.Fill(&blob_bottom_); + filler.Fill(&unit_blob_bottom_c_prev_); + filler.Fill(&unit_blob_bottom_x_); + } + + int num_output_; + LayerParameter layer_param_; + Blob blob_bottom_; + Blob blob_bottom_cont_; + Blob blob_bottom_static_; + Blob blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; + + Blob unit_blob_bottom_cont_; + Blob unit_blob_bottom_c_prev_; + Blob unit_blob_bottom_x_; + Blob unit_blob_top_c_; + Blob unit_blob_top_h_; + vector*> unit_blob_bottom_vec_; + vector*> unit_blob_top_vec_; +}; + +TYPED_TEST_CASE(LSTMLayerTest, TestDtypesAndDevices); + +TYPED_TEST(LSTMLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + vector expected_top_shape = this->blob_bottom_.shape(); + expected_top_shape.resize(3); + expected_top_shape[2] = this->num_output_; + EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); +} + +TYPED_TEST(LSTMLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + const int kNumTimesteps = 3; + const int num = this->blob_bottom_.shape(1); + this->ReshapeBlobs(kNumTimesteps, num); + + // Fill the cont blob with <0, 1, 1, ..., 1>, + // indicating a sequence that begins at the first timestep + // then continues for the rest of the sequence. + for (int t = 0; t < kNumTimesteps; ++t) { + for (int n = 0; n < num; ++n) { + this->blob_bottom_cont_.mutable_cpu_data()[t * num + n] = t > 0; + } + } + + // Process the full sequence in a single batch. 
+ FillerParameter filler_param; + filler_param.set_mean(0); + filler_param.set_std(1); + GaussianFiller sequence_filler(filler_param); + Caffe::set_random_seed(1); + sequence_filler.Fill(&this->blob_bottom_); + shared_ptr > layer(new LSTMLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + LOG(INFO) << "Calling forward for full sequence LSTM"; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Copy the inputs and outputs to reuse/check them later. + Blob bottom_copy(this->blob_bottom_.shape()); + bottom_copy.CopyFrom(this->blob_bottom_); + Blob top_copy(this->blob_top_.shape()); + top_copy.CopyFrom(this->blob_top_); + + // Process the batch one timestep at a time; + // check that we get the same result. + this->ReshapeBlobs(1, num); + layer.reset(new LSTMLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int bottom_count = this->blob_bottom_.count(); + const int top_count = this->blob_top_.count(); + const Dtype kEpsilon = 1e-5; + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_cont_.mutable_cpu_data()[n] = t > 0; + } + LOG(INFO) << "Calling forward for LSTM timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + ASSERT_LT(t * top_count + i, top_copy.count()); + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } + } + + // Process the batch one timestep at a time with all cont blobs set to 0. + // Check that we get a different result, except in the first timestep. 
+ Caffe::set_random_seed(1701); + layer.reset(new LSTMLayer(this->layer_param_)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_cont_.mutable_cpu_data()[n] = 0; + } + LOG(INFO) << "Calling forward for LSTM timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + if (t == 0) { + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } else { + EXPECT_NE(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i]) + << "t = " << t << "; i = " << i; + } + } + } +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitSetUp) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + layer.SetUp(this->unit_blob_bottom_vec_, this->unit_blob_top_vec_); + const int num_axes = this->unit_blob_bottom_c_prev_.num_axes(); + ASSERT_EQ(num_axes, this->unit_blob_top_c_.num_axes()); + ASSERT_EQ(num_axes, this->unit_blob_top_h_.num_axes()); + for (int i = 0; i < num_axes; ++i) { + EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), + this->unit_blob_top_c_.shape(i)); + EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), + this->unit_blob_top_h_.shape(i)); + } +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + Dtype* cont_data = this->blob_bottom_cont_.mutable_cpu_data(); + cont_data[0] = 0; + cont_data[1] = 0; + cont_data[2] = 0; + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + 
this->unit_blob_top_vec_, 1); +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradientNonZeroCont) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + Dtype* cont_data = this->blob_bottom_cont_.mutable_cpu_data(); + cont_data[0] = 1; + cont_data[1] = 0; + cont_data[2] = 1; + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 1); +} + +TYPED_TEST(LSTMLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroCont) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { + this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroContBufferSize2) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { + this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroContBufferSize2WithStaticInput) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + FillerParameter filler_param; + UniformFiller 
filler(filler_param); + filler.Fill(&this->blob_bottom_); + filler.Fill(&this->blob_bottom_static_); + this->blob_bottom_vec_.push_back(&this->blob_bottom_static_); + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { + this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 2); +} + + +} // namespace caffe diff --git a/src/caffe/test/test_net.cpp b/src/caffe/test/test_net.cpp index 1e0788ec127..24b957f2acc 100644 --- a/src/caffe/test/test_net.cpp +++ b/src/caffe/test/test_net.cpp @@ -9,6 +9,7 @@ #include "caffe/common.hpp" #include "caffe/filler.hpp" #include "caffe/net.hpp" +#include "caffe/util/io.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/test/test_caffe_main.hpp" @@ -29,6 +30,17 @@ class NetTest : public MultiDeviceTest { net_.reset(new Net(param)); } + virtual void InitNetFromProtoFileWithState(const string& proto, + Phase phase = caffe::TRAIN, const int level = 0, + const vector* stages = NULL) { + NetParameter param; + CHECK(google::protobuf::TextFormat::ParseFromString(proto, ¶m)); + string param_file; + MakeTempFilename(¶m_file); + WriteProtoToTextFile(param, param_file); + net_.reset(new Net(param_file, phase, level, stages)); + } + virtual void CopyNetBlobs(const bool copy_diff, vector > >* blobs_copy) { CHECK(net_); @@ -716,6 +728,117 @@ class NetTest : public MultiDeviceTest { InitNetFromProtoString(proto); } + virtual void InitForcePropNet(bool test_force_true) { + string proto = + "name: 'ForcePropTestNetwork' " + "layer { " + " name: 'data' " + " type: 'DummyData' " + " dummy_data_param { " + " shape { " + " dim: 5 " + " dim: 2 " + " dim: 3 " + " dim: 4 " + " } " + " data_filler { " + " type: 'gaussian' " + " std: 0.01 " + " } " + " shape { " + " dim: 5 " + " } " 
+ " data_filler { " + " type: 'constant' " + " value: 0 " + " } " + " } " + " top: 'data' " + " top: 'label' " + "} " + "layer { " + " name: 'innerproduct' " + " type: 'InnerProduct' " + " inner_product_param { " + " num_output: 1 " + " weight_filler { " + " type: 'gaussian' " + " std: 0.01 " + " } " + " } " + " bottom: 'data' " + " top: 'innerproduct' "; + if (test_force_true) { + proto += " propagate_down: true "; + } + proto += + "} " + "layer { " + " name: 'loss' " + " bottom: 'innerproduct' " + " bottom: 'label' " + " top: 'cross_entropy_loss' " + " type: 'SigmoidCrossEntropyLoss' " + "} "; + InitNetFromProtoString(proto); + } + + virtual void InitAllInOneNet(Phase phase = caffe::TRAIN, + const int level = 0, const vector* stages = NULL) { + string proto = + "name: 'All-in-one Network'" + "layer { " + " name: 'train-data' " + " type: 'DummyData' " + " top: 'data' " + " top: 'label' " + " dummy_data_param { " + " shape { dim: 1 dim: 10 } " + " shape { dim: 1 dim: 1 } " + " } " + " include { phase: TRAIN stage: 'train' } " + "} " + "layer { " + " name: 'val-data' " + " type: 'DummyData' " + " top: 'data' " + " top: 'label' " + " dummy_data_param { " + " shape { dim: 1 dim: 10 } " + " shape { dim: 1 dim: 1 } " + " } " + " include { phase: TEST stage: 'val' } " + "} " + "layer { " + " name: 'deploy-data' " + " type: 'Input' " + " top: 'data' " + " input_param { " + " shape { dim: 1 dim: 10 } " + " } " + " include { phase: TEST stage: 'deploy' } " + "} " + "layer { " + " name: 'ip' " + " type: 'InnerProduct' " + " bottom: 'data' " + " top: 'ip' " + " inner_product_param { " + " num_output: 2 " + " } " + "} " + "layer { " + " name: 'loss' " + " type: 'SoftmaxWithLoss' " + " bottom: 'ip' " + " bottom: 'label' " + " top: 'loss' " + " include { phase: TRAIN stage: 'train' } " + " include { phase: TEST stage: 'val' } " + "} "; + InitNetFromProtoFileWithState(proto, phase, level, stages); + } + int seed_; shared_ptr > net_; }; @@ -2371,4 +2494,111 @@ TYPED_TEST(NetTest, 
TestSkipPropagateDown) { } } +TYPED_TEST(NetTest, TestForcePropagateDown) { + this->InitForcePropNet(false); + vector layer_need_backward = this->net_->layer_need_backward(); + for (int layer_id = 0; layer_id < this->net_->layers().size(); ++layer_id) { + const string& layer_name = this->net_->layer_names()[layer_id]; + const vector need_backward = + this->net_->bottom_need_backward()[layer_id]; + if (layer_name == "data") { + ASSERT_EQ(need_backward.size(), 0); + EXPECT_FALSE(layer_need_backward[layer_id]); + } else if (layer_name == "innerproduct") { + ASSERT_EQ(need_backward.size(), 1); + EXPECT_FALSE(need_backward[0]); // data + EXPECT_TRUE(layer_need_backward[layer_id]); + } else if (layer_name == "loss") { + ASSERT_EQ(need_backward.size(), 2); + EXPECT_TRUE(need_backward[0]); // innerproduct + EXPECT_FALSE(need_backward[1]); // label + EXPECT_TRUE(layer_need_backward[layer_id]); + } else { + LOG(FATAL) << "Unknown layer: " << layer_name; + } + } + this->InitForcePropNet(true); + layer_need_backward = this->net_->layer_need_backward(); + for (int layer_id = 0; layer_id < this->net_->layers().size(); ++layer_id) { + const string& layer_name = this->net_->layer_names()[layer_id]; + const vector need_backward = + this->net_->bottom_need_backward()[layer_id]; + if (layer_name == "data") { + ASSERT_EQ(need_backward.size(), 0); + EXPECT_FALSE(layer_need_backward[layer_id]); + } else if (layer_name == "innerproduct") { + ASSERT_EQ(need_backward.size(), 1); + EXPECT_TRUE(need_backward[0]); // data + EXPECT_TRUE(layer_need_backward[layer_id]); + } else if (layer_name == "loss") { + ASSERT_EQ(need_backward.size(), 2); + EXPECT_TRUE(need_backward[0]); // innerproduct + EXPECT_FALSE(need_backward[1]); // label + EXPECT_TRUE(layer_need_backward[layer_id]); + } else { + LOG(FATAL) << "Unknown layer: " << layer_name; + } + } +} + +TYPED_TEST(NetTest, TestAllInOneNetTrain) { + vector stages; + stages.push_back("train"); + this->InitAllInOneNet(caffe::TRAIN, 0, &stages); + 
bool found_data = false; + bool found_loss = false; + for (int i = 0; i < this->net_->layers().size(); ++i) { + const string& layer_name = this->net_->layer_names()[i]; + if (layer_name == "train-data") { + found_data = true; + } else if (layer_name == "loss") { + found_loss = true; + } else { + ASSERT_NE(layer_name, "val-data"); + ASSERT_NE(layer_name, "deploy-data"); + } + } + ASSERT_TRUE(found_data); + ASSERT_TRUE(found_loss); +} + +TYPED_TEST(NetTest, TestAllInOneNetVal) { + vector stages; + stages.push_back("val"); + this->InitAllInOneNet(caffe::TEST, 0, &stages); + bool found_data = false; + bool found_loss = false; + for (int i = 0; i < this->net_->layers().size(); ++i) { + const string& layer_name = this->net_->layer_names()[i]; + if (layer_name == "val-data") { + found_data = true; + } else if (layer_name == "loss") { + found_loss = true; + } else { + ASSERT_NE(layer_name, "train-data"); + ASSERT_NE(layer_name, "deploy-data"); + } + } + ASSERT_TRUE(found_data); + ASSERT_TRUE(found_loss); +} + +TYPED_TEST(NetTest, TestAllInOneNetDeploy) { + vector stages; + stages.push_back("deploy"); + this->InitAllInOneNet(caffe::TEST, 0, &stages); + bool found_data = false; + for (int i = 0; i < this->net_->layers().size(); ++i) { + const string& layer_name = this->net_->layer_names()[i]; + if (layer_name == "deploy-data") { + found_data = true; + } else { + ASSERT_NE(layer_name, "train-data"); + ASSERT_NE(layer_name, "val-data"); + ASSERT_NE(layer_name, "loss"); + } + } + ASSERT_TRUE(found_data); +} + } // namespace caffe diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp index dd591f7d204..180871a29ee 100644 --- a/src/caffe/test/test_neuron_layer.cpp +++ b/src/caffe/test/test_neuron_layer.cpp @@ -394,6 +394,26 @@ TYPED_TEST(NeuronLayerTest, TestExpGradient) { this->TestExpGradient(kBase, kScale, kShift); } +TYPED_TEST(NeuronLayerTest, TestExpLayerWithShift) { + typedef typename TypeParam::Dtype Dtype; + // Test default base of "-1" 
-- should actually set base := e, + // with a non-zero shift + const Dtype kBase = -1; + const Dtype kScale = 1; + const Dtype kShift = 1; + this->TestExpForward(kBase, kScale, kShift); +} + +TYPED_TEST(NeuronLayerTest, TestExpGradientWithShift) { + typedef typename TypeParam::Dtype Dtype; + // Test default base of "-1" -- should actually set base := e, + // with a non-zero shift + const Dtype kBase = -1; + const Dtype kScale = 1; + const Dtype kShift = 1; + this->TestExpGradient(kBase, kScale, kShift); +} + TYPED_TEST(NeuronLayerTest, TestExpLayerBase2) { typedef typename TypeParam::Dtype Dtype; const Dtype kBase = 2; @@ -771,16 +791,19 @@ TYPED_TEST(NeuronLayerTest, TestPReLUInPlace) { ip2.Backward(blob_middle_vec_2, propagate_down, blob_bottom_vec_2); // Check numbers for (int s = 0; s < blob_bottom_2->count(); ++s) { - EXPECT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]); + EXPECT_FLOAT_EQ(this->blob_bottom_->cpu_diff()[s], + blob_bottom_2->cpu_diff()[s]); } for (int s = 0; s < ip.blobs()[0]->count(); ++s) { - EXPECT_EQ(ip.blobs()[0]->cpu_diff()[s], ip2.blobs()[0]->cpu_diff()[s]); + EXPECT_FLOAT_EQ(ip.blobs()[0]->cpu_diff()[s], + ip2.blobs()[0]->cpu_diff()[s]); } for (int s = 0; s < ip.blobs()[1]->count(); ++s) { - EXPECT_EQ(ip.blobs()[1]->cpu_diff()[s], ip2.blobs()[1]->cpu_diff()[s]); + EXPECT_FLOAT_EQ(ip.blobs()[1]->cpu_diff()[s], + ip2.blobs()[1]->cpu_diff()[s]); } for (int s = 0; s < prelu.blobs()[0]->count(); ++s) { - EXPECT_EQ(prelu.blobs()[0]->cpu_diff()[s], + EXPECT_FLOAT_EQ(prelu.blobs()[0]->cpu_diff()[s], prelu2.blobs()[0]->cpu_diff()[s]); } } diff --git a/src/caffe/test/test_rnn_layer.cpp b/src/caffe/test/test_rnn_layer.cpp new file mode 100644 index 00000000000..dd8952d62d6 --- /dev/null +++ b/src/caffe/test/test_rnn_layer.cpp @@ -0,0 +1,217 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/rnn_layer.hpp" + +#include 
"caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class RNNLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + RNNLayerTest() : num_output_(7) { + blob_bottom_vec_.push_back(&blob_bottom_); + blob_bottom_vec_.push_back(&blob_bottom_cont_); + blob_top_vec_.push_back(&blob_top_); + + ReshapeBlobs(1, 3); + + layer_param_.mutable_recurrent_param()->set_num_output(num_output_); + FillerParameter* weight_filler = + layer_param_.mutable_recurrent_param()->mutable_weight_filler(); + weight_filler->set_type("gaussian"); + weight_filler->set_std(0.2); + FillerParameter* bias_filler = + layer_param_.mutable_recurrent_param()->mutable_bias_filler(); + bias_filler->set_type("gaussian"); + bias_filler->set_std(0.1); + + layer_param_.set_phase(TEST); + } + + void ReshapeBlobs(int num_timesteps, int num_instances) { + blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); + blob_bottom_static_.Reshape(num_instances, 2, 3, 4); + vector shape(2); + shape[0] = num_timesteps; + shape[1] = num_instances; + blob_bottom_cont_.Reshape(shape); + + FillerParameter filler_param; + filler_param.set_min(-1); + filler_param.set_max(1); + UniformFiller filler(filler_param); + filler.Fill(&blob_bottom_); + } + + int num_output_; + LayerParameter layer_param_; + Blob blob_bottom_; + Blob blob_bottom_cont_; + Blob blob_bottom_static_; + Blob blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(RNNLayerTest, TestDtypesAndDevices); + +TYPED_TEST(RNNLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + vector expected_top_shape = this->blob_bottom_.shape(); + expected_top_shape.resize(3); + expected_top_shape[2] = this->num_output_; + EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); +} + +TYPED_TEST(RNNLayerTest, TestForward) { + 
typedef typename TypeParam::Dtype Dtype; + const int kNumTimesteps = 3; + const int num = this->blob_bottom_.shape(1); + this->ReshapeBlobs(kNumTimesteps, num); + + // Fill the cont blob with <0, 1, 1, ..., 1>, + // indicating a sequence that begins at the first timestep + // then continues for the rest of the sequence. + for (int t = 0; t < kNumTimesteps; ++t) { + for (int n = 0; n < num; ++n) { + this->blob_bottom_cont_.mutable_cpu_data()[t * num + n] = t > 0; + } + } + + // Process the full sequence in a single batch. + FillerParameter filler_param; + filler_param.set_mean(0); + filler_param.set_std(1); + GaussianFiller sequence_filler(filler_param); + sequence_filler.Fill(&this->blob_bottom_); + shared_ptr > layer(new RNNLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + LOG(INFO) << "Calling forward for full sequence RNN"; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Copy the inputs and outputs to reuse/check them later. + Blob bottom_copy(this->blob_bottom_.shape()); + bottom_copy.CopyFrom(this->blob_bottom_); + Blob top_copy(this->blob_top_.shape()); + top_copy.CopyFrom(this->blob_top_); + + // Process the batch one timestep at a time; + // check that we get the same result. 
+ this->ReshapeBlobs(1, num); + layer.reset(new RNNLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int bottom_count = this->blob_bottom_.count(); + const int top_count = this->blob_top_.count(); + const Dtype kEpsilon = 1e-5; + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_cont_.mutable_cpu_data()[n] = t > 0; + } + LOG(INFO) << "Calling forward for RNN timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + ASSERT_LT(t * top_count + i, top_copy.count()); + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } + } + + // Process the batch one timestep at a time with all cont blobs set to 0. + // Check that we get a different result, except in the first timestep. 
+ Caffe::set_random_seed(1701); + layer.reset(new RNNLayer(this->layer_param_)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_cont_.mutable_cpu_data()[n] = 0; + } + LOG(INFO) << "Calling forward for RNN timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + if (t == 0) { + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } else { + EXPECT_NE(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i]) + << "t = " << t << "; i = " << i; + } + } + } +} + +TYPED_TEST(RNNLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroCont) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { + this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroContBufferSize2) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + // fill the values + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { + this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; + } + 
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroContBufferSize2WithStaticInput) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + filler.Fill(&this->blob_bottom_static_); + this->blob_bottom_vec_.push_back(&this->blob_bottom_static_); + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { + this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 2); +} + +} // namespace caffe diff --git a/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp index 5dfd7656db2..1bd5f93796f 100644 --- a/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp @@ -116,5 +116,33 @@ TYPED_TEST(SigmoidCrossEntropyLossLayerTest, TestGradient) { this->blob_top_vec_, 0); } +TYPED_TEST(SigmoidCrossEntropyLossLayerTest, TestIgnoreGradient) { + typedef typename TypeParam::Dtype Dtype; + FillerParameter data_filler_param; + data_filler_param.set_std(1); + GaussianFiller data_filler(data_filler_param); + data_filler.Fill(this->blob_bottom_data_); + LayerParameter layer_param; + LossParameter* loss_param = layer_param.mutable_loss_param(); + loss_param->set_ignore_label(-1); + Dtype* target = this->blob_bottom_targets_->mutable_cpu_data(); + const int count = this->blob_bottom_targets_->count(); + // Ignore half of targets, then check that diff of this half is zero, + // while the other half is nonzero. 
+ caffe_set(count / 2, Dtype(-1), target); + SigmoidCrossEntropyLossLayer<Dtype> layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + vector<bool> propagate_down(2); + propagate_down[0] = true; + propagate_down[1] = false; + layer.Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); + const Dtype* diff = this->blob_bottom_data_->cpu_diff(); + for (int i = 0; i < count / 2; ++i) { + EXPECT_FLOAT_EQ(diff[i], 0.); + EXPECT_NE(diff[i + count / 2], 0.); + } +} + } // namespace caffe diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp index 1d269c351c1..d994225f97b 100644 --- a/src/caffe/util/benchmark.cpp +++ b/src/caffe/util/benchmark.cpp @@ -44,7 +44,6 @@ void Timer::Stop() { if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY CUDA_CHECK(cudaEventRecord(stop_gpu_, 0)); - CUDA_CHECK(cudaEventSynchronize(stop_gpu_)); #else NO_GPU; #endif @@ -66,6 +65,7 @@ float Timer::MicroSeconds() { } if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY + CUDA_CHECK(cudaEventSynchronize(stop_gpu_)); CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, stop_gpu_)); // Cuda only measure milliseconds @@ -89,6 +89,7 @@ float Timer::MilliSeconds() { } if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY + CUDA_CHECK(cudaEventSynchronize(stop_gpu_)); CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, stop_gpu_)); #else diff --git a/src/caffe/util/blocking_queue.cpp b/src/caffe/util/blocking_queue.cpp index 058668fe28c..f69d210459c 100644 --- a/src/caffe/util/blocking_queue.cpp +++ b/src/caffe/util/blocking_queue.cpp @@ -1,7 +1,6 @@ #include <boost/thread.hpp> #include <string> -#include "caffe/data_reader.hpp" #include "caffe/layers/base_data_layer.hpp" #include "caffe/parallel.hpp" #include "caffe/util/blocking_queue.hpp" @@ -88,9 +87,5 @@ size_t BlockingQueue<T>::size() const { template class BlockingQueue<Batch<float>*>; template class BlockingQueue<Batch<double>*>; -template class BlockingQueue<Datum*>; 
-template class BlockingQueue<shared_ptr<DataReader::QueuePair> >; -template class BlockingQueue<P2PSync<float>*>; -template class BlockingQueue<P2PSync<double>*>; } // namespace caffe diff --git a/src/caffe/util/db_lmdb.cpp b/src/caffe/util/db_lmdb.cpp index 0bc82b53e2b..491a9bd03a6 100644 --- a/src/caffe/util/db_lmdb.cpp +++ b/src/caffe/util/db_lmdb.cpp @@ -7,13 +7,10 @@ namespace caffe { namespace db { -const size_t LMDB_MAP_SIZE = 1099511627776; // 1 TB - void LMDB::Open(const string& source, Mode mode) { MDB_CHECK(mdb_env_create(&mdb_env_)); - MDB_CHECK(mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE)); if (mode == NEW) { - CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed"; + CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << " failed"; } int flags = 0; if (mode == READ) { @@ -35,7 +32,7 @@ void LMDB::Open(const string& source, Mode mode) { MDB_CHECK(rc); } #endif - LOG(INFO) << "Opened lmdb " << source; + LOG_IF(INFO, Caffe::root_solver()) << "Opened lmdb " << source; } LMDBCursor* LMDB::NewCursor() { @@ -48,19 +45,67 @@ LMDBCursor* LMDB::NewCursor() { } LMDBTransaction* LMDB::NewTransaction() { - MDB_txn* mdb_txn; - MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn)); - MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_)); - return new LMDBTransaction(&mdb_dbi_, mdb_txn); + return new LMDBTransaction(mdb_env_); } void LMDBTransaction::Put(const string& key, const string& value) { - MDB_val mdb_key, mdb_value; - mdb_key.mv_data = const_cast<char*>(key.data()); - mdb_key.mv_size = key.size(); - mdb_value.mv_data = const_cast<char*>(value.data()); - mdb_value.mv_size = value.size(); - MDB_CHECK(mdb_put(mdb_txn_, *mdb_dbi_, &mdb_key, &mdb_value, 0)); + keys.push_back(key); + values.push_back(value); +} + +void LMDBTransaction::Commit() { + MDB_dbi mdb_dbi; + MDB_val mdb_key, mdb_data; + MDB_txn *mdb_txn; + + // Initialize MDB variables + MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn)); + MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi)); + + for (int i = 0; i < keys.size(); i++) { + 
mdb_key.mv_size = keys[i].size(); + mdb_key.mv_data = const_cast<char*>(keys[i].data()); + mdb_data.mv_size = values[i].size(); + mdb_data.mv_data = const_cast<char*>(values[i].data()); + + // Add data to the transaction + int put_rc = mdb_put(mdb_txn, mdb_dbi, &mdb_key, &mdb_data, 0); + if (put_rc == MDB_MAP_FULL) { + // Out of memory - double the map size and retry + mdb_txn_abort(mdb_txn); + mdb_dbi_close(mdb_env_, mdb_dbi); + DoubleMapSize(); + Commit(); + return; + } + // May have failed for some other reason + MDB_CHECK(put_rc); + } + + // Commit the transaction + int commit_rc = mdb_txn_commit(mdb_txn); + if (commit_rc == MDB_MAP_FULL) { + // Out of memory - double the map size and retry + mdb_dbi_close(mdb_env_, mdb_dbi); + DoubleMapSize(); + Commit(); + return; + } + // May have failed for some other reason + MDB_CHECK(commit_rc); + + // Cleanup after successful commit + mdb_dbi_close(mdb_env_, mdb_dbi); + keys.clear(); + values.clear(); +} + +void LMDBTransaction::DoubleMapSize() { + struct MDB_envinfo current_info; + MDB_CHECK(mdb_env_info(mdb_env_, &current_info)); + size_t new_size = current_info.me_mapsize * 2; + DLOG(INFO) << "Doubling LMDB map size to " << (new_size>>20) << "MB ..."; + MDB_CHECK(mdb_env_set_mapsize(mdb_env_, new_size)); } } // namespace db diff --git a/src/caffe/util/hdf5.cpp b/src/caffe/util/hdf5.cpp index 7730e76ab87..ed73742937f 100644 --- a/src/caffe/util/hdf5.cpp +++ b/src/caffe/util/hdf5.cpp @@ -9,7 +9,7 @@ namespace caffe { template <typename Dtype> void hdf5_load_nd_dataset_helper( hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob<Dtype>* blob) { + Blob<Dtype>* blob, bool reshape) { // Verify that the dataset exists. 
CHECK(H5LTfind_dataset(file_id, dataset_name_)) << "Failed to find HDF5 dataset " << dataset_name_; @@ -29,10 +29,10 @@ void hdf5_load_nd_dataset_helper( CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_; switch (class_) { case H5T_FLOAT: - LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_FLOAT"; + { LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_FLOAT"; } break; case H5T_INTEGER: - LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_INTEGER"; + { LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_INTEGER"; } break; case H5T_TIME: LOG(FATAL) << "Unsupported datatype class: H5T_TIME"; @@ -56,17 +56,38 @@ void hdf5_load_nd_dataset_helper( LOG(FATAL) << "Datatype class unknown"; } + vector blob_dims(dims.size()); for (int i = 0; i < dims.size(); ++i) { blob_dims[i] = dims[i]; } - blob->Reshape(blob_dims); + + if (reshape) { + blob->Reshape(blob_dims); + } else { + if (blob_dims != blob->shape()) { + // create shape string for error message + ostringstream stream; + int count = 1; + for (int i = 0; i < blob_dims.size(); ++i) { + stream << blob_dims[i] << " "; + count = count * blob_dims[i]; + } + stream << "(" << count << ")"; + string source_shape_string = stream.str(); + + CHECK(blob_dims == blob->shape()) << "Cannot load blob from hdf5; shape " + << "mismatch. 
Source shape is " << source_shape_string + << " target shape is " << blob->shape_string(); + } + } } template <> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { - hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); + int min_dim, int max_dim, Blob* blob, bool reshape) { + hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob, + reshape); herr_t status = H5LTread_dataset_float( file_id, dataset_name_, blob->mutable_cpu_data()); CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_; @@ -74,8 +95,9 @@ void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, template <> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { - hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); + int min_dim, int max_dim, Blob* blob, bool reshape) { + hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob, + reshape); herr_t status = H5LTread_dataset_double( file_id, dataset_name_, blob->mutable_cpu_data()); CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_; diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 71c02274a75..59625bc05ce 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -196,6 +196,16 @@ void caffe_sqr(const int n, const double* a, double* y) { vdSqr(n, a, y); } +template <> +void caffe_sqrt(const int n, const float* a, float* y) { + vsSqrt(n, a, y); +} + +template <> +void caffe_sqrt(const int n, const double* a, double* y) { + vdSqrt(n, a, y); +} + template <> void caffe_exp(const int n, const float* a, float* y) { vsExp(n, a, y); diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu index 4c587537435..314e6ba0f63 100644 --- a/src/caffe/util/math_functions.cu +++ b/src/caffe/util/math_functions.cu @@ -90,6 +90,26 
@@ void caffe_gpu_scal(const int N, const double alpha, double *X) { CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); } +template <> +void caffe_gpu_scal(const int N, const float alpha, float* X, + cudaStream_t str) { + cudaStream_t initial_stream; + CUBLAS_CHECK(cublasGetStream(Caffe::cublas_handle(), &initial_stream)); + CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), str)); + CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), initial_stream)); +} + +template <> +void caffe_gpu_scal(const int N, const double alpha, double* X, + cudaStream_t str) { + cudaStream_t initial_stream; + CUBLAS_CHECK(cublasGetStream(Caffe::cublas_handle(), &initial_stream)); + CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), str)); + CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), initial_stream)); +} + template <> void caffe_gpu_axpby(const int N, const float alpha, const float* X, const float beta, float* Y) { @@ -367,6 +387,27 @@ void caffe_gpu_powx(const int N, const double* a, N, a, alpha, y); } +template +__global__ void sqrt_kernel(const int n, const Dtype* a, Dtype* y) { + CUDA_KERNEL_LOOP(index, n) { + y[index] = sqrt(a[index]); + } +} + +template <> +void caffe_gpu_sqrt(const int N, const float* a, float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + sqrt_kernel<<>>( + N, a, y); +} + +template <> +void caffe_gpu_sqrt(const int N, const double* a, double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + sqrt_kernel<<>>( + N, a, y); +} + DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) - (x[index] < Dtype(0))); DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp index 9e186915b43..94771c8c050 100644 --- a/src/caffe/util/upgrade_proto.cpp +++ 
b/src/caffe/util/upgrade_proto.cpp @@ -14,7 +14,8 @@ namespace caffe { bool NetNeedsUpgrade(const NetParameter& net_param) { return NetNeedsV0ToV1Upgrade(net_param) || NetNeedsV1ToV2Upgrade(net_param) - || NetNeedsDataUpgrade(net_param) || NetNeedsInputUpgrade(net_param); + || NetNeedsDataUpgrade(net_param) || NetNeedsInputUpgrade(net_param) + || NetNeedsBatchNormUpgrade(net_param); } bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) { @@ -71,6 +72,14 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) { LOG(WARNING) << "Note that future Caffe releases will only support " << "input layers and not input fields."; } + // NetParameter uses old style batch norm layers; try to upgrade it. + if (NetNeedsBatchNormUpgrade(*param)) { + LOG(INFO) << "Attempting to upgrade batch norm layers using deprecated " + << "params: " << param_file; + UpgradeNetBatchNorm(param); + LOG(INFO) << "Successfully upgraded batch norm layers using deprecated " + << "params."; + } return success; } @@ -991,6 +1000,35 @@ void UpgradeNetInput(NetParameter* net_param) { net_param->clear_input_dim(); } +bool NetNeedsBatchNormUpgrade(const NetParameter& net_param) { + for (int i = 0; i < net_param.layer_size(); ++i) { + // Check if BatchNorm layers declare three parameters, as required by + // the previous BatchNorm layer definition. + if (net_param.layer(i).type() == "BatchNorm" + && net_param.layer(i).param_size() == 3) { + return true; + } + } + return false; +} + +void UpgradeNetBatchNorm(NetParameter* net_param) { + for (int i = 0; i < net_param->layer_size(); ++i) { + // Check if BatchNorm layers declare three parameters, as required by + // the previous BatchNorm layer definition. + if (net_param->layer(i).type() == "BatchNorm" + && net_param->layer(i).param_size() == 3) { + // set lr_mult and decay_mult to zero. leave all other param intact. 
+ for (int ip = 0; ip < net_param->layer(i).param_size(); ip++) { + ParamSpec* fixed_param_spec = + net_param->mutable_layer(i)->mutable_param(ip); + fixed_param_spec->set_lr_mult(0.f); + fixed_param_spec->set_decay_mult(0.f); + } + } + } +} + // Return true iff the solver contains any old solver_type specified as enums bool SolverNeedsTypeUpgrade(const SolverParameter& solver_param) { if (solver_param.has_solver_type()) { diff --git a/src/gtest/CMakeLists.txt b/src/gtest/CMakeLists.txt index ef7ff7ed14b..e98254af130 100644 --- a/src/gtest/CMakeLists.txt +++ b/src/gtest/CMakeLists.txt @@ -1,5 +1,8 @@ add_library(gtest STATIC EXCLUDE_FROM_ALL gtest.h gtest-all.cpp) caffe_default_properties(gtest) +target_include_directories(gtest PUBLIC ${Caffe_SRC_DIR}) +target_compile_definitions(gtest PUBLIC -DGTEST_USE_OWN_TR1_TUPLE) + #add_library(gtest_main gtest_main.cc) #target_link_libraries(gtest_main gtest) diff --git a/src/gtest/gtest-all.cpp b/src/gtest/gtest-all.cpp index 926197419fc..81cdb578cd5 100644 --- a/src/gtest/gtest-all.cpp +++ b/src/gtest/gtest-all.cpp @@ -2697,7 +2697,7 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT // Utility functions for encoding Unicode text (wide strings) in // UTF-8. -// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8 +// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8 // like this: // // Code-point length Encoding @@ -7550,7 +7550,7 @@ FilePath FilePath::RemoveExtension(const char* extension) const { return *this; } -// Returns a pointer to the last occurence of a valid path separator in +// Returns a pointer to the last occurrence of a valid path separator in // the FilePath. On Windows, for example, both '/' and '\' are valid path // separators. Returns NULL if no path separator was found. 
const char* FilePath::FindLastPathSeparator() const { diff --git a/src/gtest/gtest.h b/src/gtest/gtest.h index 3143bd67996..124fb2321f9 100644 --- a/src/gtest/gtest.h +++ b/src/gtest/gtest.h @@ -3395,7 +3395,7 @@ class GTEST_API_ FilePath { void Normalize(); - // Returns a pointer to the last occurence of a valid path separator in + // Returns a pointer to the last occurrence of a valid path separator in // the FilePath. On Windows, for example, both '/' and '\' are valid path // separators. Returns NULL if no path separator was found. const char* FindLastPathSeparator() const; diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 02fbd5cadd8..3789450555e 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -25,5 +25,6 @@ foreach(source ${srcs}) endif() # Install - install(TARGETS ${name} DESTINATION bin) + install(TARGETS ${name} DESTINATION ${CMAKE_INSTALL_BINDIR}) + endforeach(source) diff --git a/tools/caffe.cpp b/tools/caffe.cpp index 5d9331f0c22..3587d8aa1be 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -33,7 +33,14 @@ DEFINE_string(gpu, "", DEFINE_string(solver, "", "The solver definition protocol buffer text file."); DEFINE_string(model, "", - "The model definition protocol buffer text file.."); + "The model definition protocol buffer text file."); +DEFINE_string(phase, "", + "Optional; network phase (TRAIN or TEST). 
Only used for 'time'."); +DEFINE_int32(level, 0, + "Optional; network level."); +DEFINE_string(stage, "", + "Optional; network stages (not to be confused with phase), " + "separated by ','."); DEFINE_string(snapshot, "", "Optional; the snapshot solver state to resume training."); DEFINE_string(weights, "", @@ -101,6 +108,25 @@ static void get_gpus(vector* gpus) { } } +// Parse phase from flags +caffe::Phase get_phase_from_flags(caffe::Phase default_value) { + if (FLAGS_phase == "") + return default_value; + if (FLAGS_phase == "TRAIN") + return caffe::TRAIN; + if (FLAGS_phase == "TEST") + return caffe::TEST; + LOG(FATAL) << "phase must be \"TRAIN\" or \"TEST\""; + return caffe::TRAIN; // Avoid warning +} + +// Parse stages from flags +vector get_stages_from_flags() { + vector stages; + boost::split(stages, FLAGS_stage, boost::is_any_of(",")); + return stages; +} + // caffe commands to call by // caffe // @@ -156,13 +182,20 @@ int train() { CHECK(!FLAGS_snapshot.size() || !FLAGS_weights.size()) << "Give a snapshot to resume training or weights to finetune " "but not both."; + vector stages = get_stages_from_flags(); caffe::SolverParameter solver_param; caffe::ReadSolverParamsFromTextFileOrDie(FLAGS_solver, &solver_param); + solver_param.mutable_train_state()->set_level(FLAGS_level); + for (int i = 0; i < stages.size(); i++) { + solver_param.mutable_train_state()->add_stage(stages[i]); + } + // If the gpus flag is not provided, allow the mode and device to be set // in the solver prototxt. 
if (FLAGS_gpu.size() == 0 + && solver_param.has_solver_mode() && solver_param.solver_mode() == caffe::SolverParameter_SolverMode_GPU) { if (solver_param.has_device_id()) { FLAGS_gpu = "" + @@ -212,11 +245,15 @@ int train() { CopyLayers(solver.get(), FLAGS_weights); } + LOG(INFO) << "Starting Optimization"; if (gpus.size() > 1) { - caffe::P2PSync sync(solver, NULL, solver->param()); - sync.Run(gpus); +#ifdef USE_NCCL + caffe::NCCL nccl(solver); + nccl.Run(gpus, FLAGS_snapshot.size() > 0 ? FLAGS_snapshot.c_str() : NULL); +#else + LOG(FATAL) << "Multi-GPU execution not available - rebuild with USE_NCCL"; +#endif } else { - LOG(INFO) << "Starting Optimization"; solver->Solve(); } LOG(INFO) << "Optimization Done."; @@ -229,6 +266,7 @@ RegisterBrewFunction(train); int test() { CHECK_GT(FLAGS_model.size(), 0) << "Need a model definition to score."; CHECK_GT(FLAGS_weights.size(), 0) << "Need model weights to score."; + vector stages = get_stages_from_flags(); // Set device id and mode vector gpus; @@ -247,7 +285,7 @@ int test() { Caffe::set_mode(Caffe::CPU); } // Instantiate the caffe net. - Net caffe_net(FLAGS_model, caffe::TEST); + Net caffe_net(FLAGS_model, caffe::TEST, FLAGS_level, &stages); caffe_net.CopyTrainedLayersFrom(FLAGS_weights); LOG(INFO) << "Running for " << FLAGS_iterations << " iterations."; @@ -300,6 +338,8 @@ RegisterBrewFunction(test); // Time: benchmark the execution time of a model. int time() { CHECK_GT(FLAGS_model.size(), 0) << "Need a model definition to time."; + caffe::Phase phase = get_phase_from_flags(caffe::TRAIN); + vector stages = get_stages_from_flags(); // Set device id and mode vector gpus; @@ -313,7 +353,7 @@ int time() { Caffe::set_mode(Caffe::CPU); } // Instantiate the caffe net. - Net caffe_net(FLAGS_model, caffe::TRAIN); + Net caffe_net(FLAGS_model, phase, FLAGS_level, &stages); // Do a clean forward and backward pass, so that memory allocation are done // and future iterations will be more stable. 
diff --git a/tools/compute_image_mean.cpp b/tools/compute_image_mean.cpp index 2035d515195..417f5e4c622 100644 --- a/tools/compute_image_mean.cpp +++ b/tools/compute_image_mean.cpp @@ -22,9 +22,11 @@ DEFINE_string(backend, "lmdb", "The backend {leveldb, lmdb} containing the images"); int main(int argc, char** argv) { +#ifdef USE_OPENCV ::google::InitGoogleLogging(argv[0]); + // Print output to stderr (while still logging) + FLAGS_alsologtostderr = 1; -#ifdef USE_OPENCV #ifndef GFLAGS_GFLAGS_H_ namespace gflags = google; #endif @@ -65,7 +67,7 @@ int main(int argc, char** argv) { for (int i = 0; i < size_in_datum; ++i) { sum_blob.add_data(0.); } - LOG(INFO) << "Starting Iteration"; + LOG(INFO) << "Starting iteration"; while (cursor->valid()) { Datum datum; datum.ParseFromString(cursor->value()); @@ -114,7 +116,7 @@ int main(int argc, char** argv) { for (int i = 0; i < dim; ++i) { mean_values[c] += sum_blob.data(dim * c + i); } - LOG(INFO) << "mean_value channel [" << c << "]:" << mean_values[c] / dim; + LOG(INFO) << "mean_value channel [" << c << "]: " << mean_values[c] / dim; } #else LOG(FATAL) << "This tool requires OpenCV; compile with USE_OPENCV."; diff --git a/tools/convert_imageset.cpp b/tools/convert_imageset.cpp index 9c52bfa0ef8..90cdb15d427 100644 --- a/tools/convert_imageset.cpp +++ b/tools/convert_imageset.cpp @@ -73,10 +73,13 @@ int main(int argc, char** argv) { std::ifstream infile(argv[2]); std::vector > lines; - std::string filename; + std::string line; + size_t pos; int label; - while (infile >> filename >> label) { - lines.push_back(std::make_pair(filename, label)); + while (std::getline(infile, line)) { + pos = line.find_last_of(' '); + label = atoi(line.substr(pos + 1).c_str()); + lines.push_back(std::make_pair(line.substr(0, pos), label)); } if (FLAGS_shuffle) { // randomly shuffle data diff --git a/tools/extra/extract_seconds.py b/tools/extra/extract_seconds.py index 591a51f96bd..68af69a2788 100755 --- a/tools/extra/extract_seconds.py +++ 
b/tools/extra/extract_seconds.py @@ -48,11 +48,19 @@ def extract_seconds(input_file, output_file): start_datetime = get_start_time(lines, log_created_year) assert start_datetime, 'Start time not found' + last_dt = start_datetime out = open(output_file, 'w') for line in lines: line = line.strip() if line.find('Iteration') != -1: dt = extract_datetime_from_line(line, log_created_year) + + # if it's another year + if dt.month < last_dt.month: + log_created_year += 1 + dt = extract_datetime_from_line(line, log_created_year) + last_dt = dt + elapsed_seconds = (dt - start_datetime).total_seconds() out.write('%f\n' % elapsed_seconds) out.close() diff --git a/tools/extra/parse_log.py b/tools/extra/parse_log.py index bb9b65ad615..4248e2b87a3 100755 --- a/tools/extra/parse_log.py +++ b/tools/extra/parse_log.py @@ -16,13 +16,10 @@ def parse_log(path_to_log): """Parse log file - Returns (train_dict_list, train_dict_names, test_dict_list, test_dict_names) + Returns (train_dict_list, test_dict_list) train_dict_list and test_dict_list are lists of dicts that define the table rows - - train_dict_names and test_dict_names are ordered tuples of the column names - for the two dict_lists """ regex_iteration = re.compile('Iteration (\d+)') @@ -41,6 +38,7 @@ def parse_log(path_to_log): logfile_year = extract_seconds.get_log_created_year(path_to_log) with open(path_to_log) as f: start_time = extract_seconds.get_start_time(f, logfile_year) + last_time = start_time for line in f: iteration_match = regex_iteration.search(line) @@ -51,8 +49,19 @@ def parse_log(path_to_log): # iteration continue - time = extract_seconds.extract_datetime_from_line(line, - logfile_year) + try: + time = extract_seconds.extract_datetime_from_line(line, + logfile_year) + except ValueError: + # Skip lines with bad formatting, for example when resuming solver + continue + + # if it's another year + if time.month < last_time.month: + logfile_year += 1 + time = extract_seconds.extract_datetime_from_line(line, 
logfile_year) + last_time = time + seconds = (time - start_time).total_seconds() learning_rate_match = regex_learning_rate.search(line) @@ -194,7 +203,7 @@ def main(): args = parse_args() train_dict_list, test_dict_list = parse_log(args.logfile_path) save_csv_files(args.logfile_path, args.output_dir, train_dict_list, - test_dict_list, delimiter=args.delimiter) + test_dict_list, delimiter=args.delimiter, verbose=args.verbose) if __name__ == '__main__': diff --git a/tools/extra/parse_log.sh b/tools/extra/parse_log.sh index 9892c897682..122eb9e6eed 100755 --- a/tools/extra/parse_log.sh +++ b/tools/extra/parse_log.sh @@ -39,7 +39,7 @@ rm aux.txt aux0.txt aux1.txt aux2.txt aux3.txt aux4.txt grep '] Solving ' $1 > aux.txt grep ', loss = ' $1 >> aux.txt grep 'Iteration ' aux.txt | sed 's/.*Iteration \([[:digit:]]*\).*/\1/g' > aux0.txt -grep ', loss = ' $1 | awk '{print $9}' > aux1.txt +grep ', loss = ' $1 | awk -F = '{print $2}' > aux1.txt grep ', lr = ' $1 | awk '{print $9}' > aux2.txt # Extracting elapsed seconds diff --git a/tools/extra/plot_log.gnuplot.example b/tools/extra/plot_log.gnuplot.example index 748b96e6925..02c68e1d24f 100644 --- a/tools/extra/plot_log.gnuplot.example +++ b/tools/extra/plot_log.gnuplot.example @@ -4,7 +4,7 @@ # Be warned that the fields in the training log may change in the future. # You had better check the data files before designing your own plots. -# Please generate the neccessary data files with +# Please generate the necessary data files with # /path/to/caffe/tools/extra/parse_log.sh before plotting. 
# Example usage: # ./parse_log.sh mnist.log diff --git a/tools/extra/plot_training_log.py.example b/tools/extra/plot_training_log.py.example index 4d3ed0d15a9..8caca6b8a67 100755 --- a/tools/extra/plot_training_log.py.example +++ b/tools/extra/plot_training_log.py.example @@ -10,7 +10,8 @@ import matplotlib.legend as lgd import matplotlib.markers as mks def get_log_parsing_script(): - dirname = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + dirname = os.path.dirname(os.path.abspath(inspect.getfile( + inspect.currentframe()))) return dirname + '/parse_log.sh' def get_log_file_suffix(): @@ -61,16 +62,17 @@ def get_data_file_type(chart_type): return data_file_type def get_data_file(chart_type, path_to_log): - return os.path.basename(path_to_log) + '.' + get_data_file_type(chart_type).lower() + return (os.path.basename(path_to_log) + '.' + + get_data_file_type(chart_type).lower()) def get_field_descriptions(chart_type): description = get_chart_type_description(chart_type).split( get_chart_type_description_separator()) y_axis_field = description[0] x_axis_field = description[1] - return x_axis_field, y_axis_field + return x_axis_field, y_axis_field -def get_field_indecies(x_axis_field, y_axis_field): +def get_field_indices(x_axis_field, y_axis_field): data_file_type = get_data_file_type(chart_type) fields = create_field_index()[0][data_file_type] return fields[x_axis_field], fields[y_axis_field] @@ -88,9 +90,9 @@ def load_data(data_file, field_idx0, field_idx1): def random_marker(): markers = mks.MarkerStyle.markers - num = len(markers.values()) + num = len(markers.keys()) idx = random.randint(0, num - 1) - return markers.values()[idx] + return markers.keys()[idx] def get_data_label(path_to_log): label = path_to_log[path_to_log.rfind('/')+1 : path_to_log.rfind( @@ -111,7 +113,7 @@ def plot_chart(chart_type, path_to_png, path_to_log_list): os.system('%s %s' % (get_log_parsing_script(), path_to_log)) data_file = get_data_file(chart_type, 
path_to_log) x_axis_field, y_axis_field = get_field_descriptions(chart_type) - x, y = get_field_indecies(x_axis_field, y_axis_field) + x, y = get_field_indices(x_axis_field, y_axis_field) data = load_data(data_file, x, y) ## TODO: more systematic color cycle for lines color = [random.random(), random.random(), random.random()] @@ -124,22 +126,15 @@ def plot_chart(chart_type, path_to_png, path_to_log_list): plt.plot(data[0], data[1], label = label, color = color, linewidth = linewidth) else: - ok = False - ## Some markers throw ValueError: Unrecognized marker style - while not ok: - try: - marker = random_marker() - plt.plot(data[0], data[1], label = label, color = color, - marker = marker, linewidth = linewidth) - ok = True - except: - pass + marker = random_marker() + plt.plot(data[0], data[1], label = label, color = color, + marker = marker, linewidth = linewidth) legend_loc = get_legend_loc(chart_type) plt.legend(loc = legend_loc, ncol = 1) # ajust ncol to fit the space plt.title(get_chart_type_description(chart_type)) plt.xlabel(x_axis_field) - plt.ylabel(y_axis_field) - plt.savefig(path_to_png) + plt.ylabel(y_axis_field) + plt.savefig(path_to_png) plt.show() def print_help(): @@ -160,28 +155,30 @@ Supported chart types:""" % (len(get_supported_chart_types()) - 1, num = len(supported_chart_types) for i in xrange(num): print ' %d: %s' % (i, supported_chart_types[i]) - exit + sys.exit() def is_valid_chart_type(chart_type): return chart_type >= 0 and chart_type < len(get_supported_chart_types()) - + if __name__ == '__main__': if len(sys.argv) < 4: print_help() else: chart_type = int(sys.argv[1]) if not is_valid_chart_type(chart_type): + print '%s is not a valid chart type.' 
% chart_type print_help() path_to_png = sys.argv[2] if not path_to_png.endswith('.png'): print 'Path must ends with png' % path_to_png - exit + sys.exit() path_to_logs = sys.argv[3:] for path_to_log in path_to_logs: if not os.path.exists(path_to_log): print 'Path does not exist: %s' % path_to_log - exit + sys.exit() if not path_to_log.endswith(get_log_file_suffix()): + print 'Log file must end in %s.' % get_log_file_suffix() print_help() ## plot_chart accpets multiple path_to_logs plot_chart(chart_type, path_to_png, path_to_logs) diff --git a/tools/extra/resize_and_crop_images.py b/tools/extra/resize_and_crop_images.py index c844f590c06..fd2c3134edb 100755 --- a/tools/extra/resize_and_crop_images.py +++ b/tools/extra/resize_and_crop_images.py @@ -101,7 +101,7 @@ def map(self, key, value): yield value, FLAGS.output_folder mapreducer.REGISTER_DEFAULT_MAPPER(ResizeCropImagesMapper) - +mapreducer.REGISTER_DEFAULT_REDUCER(mapreducer.NoPassReducer) mapreducer.REGISTER_DEFAULT_READER(mapreducer.FileReader) mapreducer.REGISTER_DEFAULT_WRITER(mapreducer.FileWriter) diff --git a/tools/extract_features.cpp b/tools/extract_features.cpp index 704467250a6..51c791e4021 100644 --- a/tools/extract_features.cpp +++ b/tools/extract_features.cpp @@ -130,7 +130,7 @@ int feature_extraction_pipeline(int argc, char** argv) { txns.push_back(txn); } - LOG(ERROR)<< "Extacting Features"; + LOG(ERROR)<< "Extracting Features"; Datum datum; std::vector image_indices(num_features, 0);