diff --git a/dynet/exec.cc b/dynet/exec.cc
index 8594dd70d..6240a61a1 100644
--- a/dynet/exec.cc
+++ b/dynet/exec.cc
@@ -566,8 +566,11 @@ const Tensor& BatchedExecutionEngine::incremental_forward_no_update(
         *(active_un_end++) = j;
       }
     }
-    for (size_t j = 0; j < (size_t)sigmap.size(); ++j)
-      prof2avg[j] /= prof2cnt[j];
+    for (size_t j = 0; j < (size_t)sigmap.size(); ++j) // -prh sigmap.size() == 40
+    {
+      if(std::abs(prof2cnt[j]) > 1e-8)
+        prof2avg[j] /= prof2cnt[j];
+    }
 
     // 2) Travel through and do active nodes
     while (node_id != (VariableIndex)uptop1) {
@@ -1060,7 +1063,7 @@ void BatchedExecutionEngine::backward(VariableIndex from_where, bool full) {
         // No concatenation whatsoever
         if (my_batch.concat[ai] == 0) {
           if (needs_derivative[node2batch[arg]]) {
-            node->backward(xs, my_batch.nfx, batched_ndEdfs[i], ai, batched_ndEdfs[node2batch[arg]]);
+            node->backward(xs, my_batch.nfx, batched_ndEdfs[i], ai, ndEdfs[arg]);
             // cerr << "batched backward[" << i << "](" << ai << ")->" << node2batch[arg] << " == " << print_vec(as_vector(batched_ndEdfs[node2batch[arg]])) << endl;
           }
         // Needs concatenation
diff --git a/dynet/matrix-multiply.h b/dynet/matrix-multiply.h
index 08794fd03..4d7858896 100644
--- a/dynet/matrix-multiply.h
+++ b/dynet/matrix-multiply.h
@@ -162,6 +162,11 @@ inline void MatrixTranspMultiplyAcc(const dynet::Device_CPU & dev, const dynet::
 inline void MatrixMultiplyTranspAcc(const dynet::Device_GPU & dev, const dynet::Tensor& l, const dynet::Tensor& r, dynet::Tensor& y) {
   int max_b = std::max(l.d.bd, r.d.bd);
   if(y.d.bd == 1 && (l.d.bd == r.d.bd)) {
+    DYNET_ARG_CHECK(l.d.rows() == y.d.rows(), "MatrixMultiplyTranspAcc: l.d.rows() != y.d.rows()");
+    DYNET_ARG_CHECK(r.d.rows() == y.d.cols(), "MatrixMultiplyTranspAcc: r.d.rows() != y.d.cols()");
+    DYNET_ARG_CHECK(l.d.cols() == r.d.cols(), "MatrixMultiplyTranspAcc: l.d.cols() != r.d.cols()");
+    DYNET_ARG_CHECK(l.d.batch_elems() == r.d.batch_elems(), "MatrixMultiplyTranspAcc: l.d.batch_elems() != r.d.batch_elems()");
+
     CUBLAS_CHECK(cublasSgemm(dev.cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T,
           y.d.rows(), y.d.cols(), l.d.cols() * l.d.batch_elems(),
           dev.kSCALAR_ONE,
@@ -183,6 +188,10 @@ inline void MatrixMultiplyTranspAcc(const dynet::Device_GPU & dev, const dynet::
 inline void MatrixMultiplyTranspAcc(const dynet::Device_CPU & dev, const dynet::Tensor& l, const dynet::Tensor& r, dynet::Tensor& y) {
   int max_b = std::max(l.d.bd, r.d.bd);
   if(y.d.bd == 1 && (l.d.bd == r.d.bd)) {
+    DYNET_ARG_CHECK(l.d.rows() == y.d.rows(), "MatrixMultiplyTranspAcc [CPU]: l.d.rows() != y.d.rows()");
+    DYNET_ARG_CHECK(r.d.rows() == y.d.cols(), "MatrixMultiplyTranspAcc [CPU]: r.d.rows() != y.d.cols()");
+    DYNET_ARG_CHECK(l.d.cols() == r.d.cols(), "MatrixMultiplyTranspAcc [CPU]: l.d.cols() != r.d.cols()");
+    DYNET_ARG_CHECK(l.d.batch_elems() == r.d.batch_elems(), "MatrixMultiplyTranspAcc [CPU]: l.d.batch_elems() != r.d.batch_elems()");
     mat(y).noalias() += colbatch_matrix(l) * colbatch_matrix(r).transpose();
   } else {
 #ifdef __INTEL_MKL__
diff --git a/dynet/nodes-matrixmultiply.cc b/dynet/nodes-matrixmultiply.cc
index 8ee92f2a7..ccf318c6f 100644
--- a/dynet/nodes-matrixmultiply.cc
+++ b/dynet/nodes-matrixmultiply.cc
@@ -51,6 +51,7 @@ template
 void MatrixMultiply::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
   DYNET_ASSERT(xs.size() == 2, "Failed dimension check in MatrixMultiply::forward");
   DYNET_ARG_CHECK(fx.d.bd == max(xs[0]->d.bd, xs[1]->d.bd), "Failed dimension check in MatrixMultiply::forward");
+  DYNET_ARG_CHECK(fx.d.batch_size() == dim_forward({xs[0]->d, xs[1]->d}).batch_size(), "Failed result dimension check in MatrixMultiply::forward");
   // fx = mat(fx0) + xs[0] * xs[1]
   dynet::MatrixMultiply(dev, *xs[0], *xs[1], fx, dev.kSCALAR_ZERO);
 }
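
// Editor's note: a minimal standalone sketch (not part of the patch) of the guarded-average
// pattern used in the exec.cc hunk above. The variable names (sums, counts, avgs) are
// hypothetical and are not DyNet APIs; the point is only that dividing by a zero count would
// produce NaN/Inf, so the division is skipped when the count is numerically zero, using the
// same 1e-8 tolerance as the patch.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> sums   = {6.f, 0.f, 9.f};   // per-signature totals
  std::vector<float> counts = {3.f, 0.f, 2.f};   // per-signature counts (one is zero)
  std::vector<float> avgs(sums);                 // averages computed in place, like prof2avg
  for (size_t j = 0; j < avgs.size(); ++j) {
    if (std::abs(counts[j]) > 1e-8)              // skip signatures that were never seen
      avgs[j] /= counts[j];                      // safe: count is non-zero
  }
  for (float a : avgs) std::printf("%g\n", a);   // prints 2, 0, 4.5
  return 0;
}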