diff --git a/.travis.yml b/.travis.yml
index beda761..8eabda2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -30,10 +30,11 @@ install:
 
 script:
     - pip install runtests
-    - mpirun -n 1 python $PWD/tests/roundtrip.py -v -Nproc 1 1 -Nmesh 13 15 16
-    - mpirun -n 2 python $PWD/tests/roundtrip.py -v -Nproc 2 1 -Nproc 1 2 -Nmesh 13 15 16
-    - mpirun -n 4 python $PWD/tests/roundtrip.py -v -Nproc 2 2 -Nmesh 13 15 16
     - python ./runtests.py --mpirun="mpirun -np 4"
+    - mpirun -n 1 python $PWD/scripts/pfft-roundtrip-matrix.py -v -diag -Nproc 1 1 -Nmesh 13 15 16
+    - mpirun -n 2 python $PWD/scripts/pfft-roundtrip-matrix.py -v -diag -Nproc 2 1 -Nproc 1 2 -Nmesh 13 15 16
+    - mpirun -n 4 python $PWD/scripts/pfft-roundtrip-matrix.py -v -diag -Nproc 2 2 -Nmesh 13 15 16
+    - mpirun -n 4 python $PWD/scripts/pfft-roundtrip-matrix.py -v -diag -Nproc 2 2 -Nmesh 13 15
     - bash check_tag.sh pfft/version.py
 
 deploy:
diff --git a/depends/install_pfft.sh b/depends/install_pfft.sh
index 23efd07..f0d6bce 100644
--- a/depends/install_pfft.sh
+++ b/depends/install_pfft.sh
@@ -7,7 +7,7 @@ OPTIMIZE1=`echo "$*" | sed 's;enable-sse2;enable-sse;'`
 echo "Optimization for double" ${OPTIMIZE}
 echo "Optimization for single" ${OPTIMIZE1}
 
-PFFT_VERSION=1.0.8-alpha3-fftw3
+PFFT_VERSION=1.0.8-alpha3-fftw3-2don2d
 TMP="tmp-pfft-$PFFT_VERSION"
 LOGFILE="build.log"
 
diff --git a/pfft/core.pyx b/pfft/core.pyx
index 21fb9f8..1b3e433 100644
--- a/pfft/core.pyx
+++ b/pfft/core.pyx
@@ -187,12 +187,42 @@ class Flags(int):
     PFFT_BUFFERED_INPLACE = _PFFT_BUFFERED_INPLACE
     PFFT_PADDED_R2C = _PFFT_PADDED_R2C
     PFFT_PADDED_C2R = _PFFT_PADDED_C2R
+    TRANSPOSED_NONE = _PFFT_TRANSPOSED_NONE
+    TRANSPOSED_IN = _PFFT_TRANSPOSED_IN
+    TRANSPOSED_OUT = _PFFT_TRANSPOSED_OUT
+    SHIFTED_NONE = _PFFT_SHIFTED_NONE
+    SHIFTED_IN = _PFFT_SHIFTED_IN
+    SHIFTED_OUT = _PFFT_SHIFTED_OUT
+    MEASURE = _PFFT_MEASURE
+    ESTIMATE = _PFFT_ESTIMATE
+    PATIENT = _PFFT_PATIENT
+    EXHAUSTIVE = _PFFT_EXHAUSTIVE
+    NO_TUNE = _PFFT_NO_TUNE
+    TUNE = _PFFT_TUNE
+    PRESERVE_INPUT = _PFFT_PRESERVE_INPUT
+    DESTROY_INPUT = _PFFT_DESTROY_INPUT
+    BUFFERED_INPLACE = _PFFT_BUFFERED_INPLACE
+    PADDED_R2C = _PFFT_PADDED_R2C
+    PADDED_C2R = _PFFT_PADDED_C2R
     def __new__(cls, value):
         self = int.__new__(cls, value)
         return self
     def __repr__(self):
         d = self.__class__.__dict__
-        return '|'.join([k for k in d.keys() if k.startswith('PFFT') and (d[k] & self)])
+        keys = sorted([k for k in d.keys() if k.isupper() and not k.startswith('PFFT')])
+        return '|'.join([k for k in keys if (d[k] & self)])
+
+    def format(self, flags=None):  # fixed-width text rendering of this flag set; `flags` is an optional bitmask selecting which names get a column
+        d = self.__class__.__dict__
+        keys = sorted([k for k in d.keys() if k.isupper() and not k.startswith('PFFT')])  # un-prefixed upper-case class attributes, sorted for a stable order
+        s = []
+        for key in keys:
+            if flags is not None and not (d[key] & flags): continue  # name does not intersect the requested mask: emit nothing for it
+            if d[key] & self:
+                s.append(key)
+            else:
+                s.append(" " * len(key))  # flag not set: blank padding as wide as the name
+        return ' '.join(s)
 
 class Direction(int):
     """ 
@@ -200,12 +230,15 @@ class Direction(int):
     """
     PFFT_FORWARD = _PFFT_FORWARD
     PFFT_BACKWARD = _PFFT_BACKWARD
+    FORWARD = _PFFT_FORWARD
+    BACKWARD = _PFFT_BACKWARD
     def __new__(cls, value):
         self = int.__new__(cls, value)
         return self
     def __repr__(self):
         d = self.__class__.__dict__
-        return 'and'.join([k for k in d.keys() if k.startswith('PFFT') and (d[k] == self)])
+        keys = sorted([k for k in d.keys() if k.isupper() and not k.startswith('PFFT')])
+        return 'and'.join([k for k in keys if (d[k] == self)])
 
 ######
 # define Type as the transform type
@@ -225,12 +258,21 @@ class Type(int):
     PFFTF_R2C = 5
     PFFTF_C2R = 6
     PFFTF_R2R = 7
+    C2C = 0
+    R2C = 1
+    C2R = 2
+    R2R = 3
+    C2CF = 4
+    R2CF = 5
+    C2RF = 6
+    R2RF = 7
     def __new__(cls, value):
         self = int.__new__(cls, value)
         return self
     def __repr__(self):
         d = self.__class__.__dict__
-        return 'and'.join([k for k in d.keys() if k.startswith('PFFT') and (d[k] == self)])
+        keys = sorted([k for k in d.keys() if k.isupper() and not k.startswith('PFFT')])
+        return 'and'.join([k for k in keys if (d[k] == self)])
 
 ctypedef numpy.intp_t (*pfft_local_size_func)(int rnk_n, numpy.intp_t * n, cMPI.MPI_Comm comm, int
             pfft_flags, numpy.intp_t * local_ni, numpy.intp_t * local_i_start,
@@ -484,14 +526,20 @@ cdef class Partition(object):
 
         local_ni, local_no, local_i_start, local_o_start = numpy.empty((4, n_.shape[0]), 'intp')
 
+        self.type = Type(type)
+        self.flags = Flags(flags)
+
         if len(n_) < len(procmesh.np):
             raise ValueError("ProcMesh (%d) shall have less dimentions than Mesh (%d)" % (len(procmesh.np), len(n_)))
 
-        if len(n_) == len(procmesh.np): # https://github.com/mpip/pfft/issues/29
-            raise NotImplementedError("Currently using the same ProcMesh (%d) dimentions with Mesh (%d) is not supported." % (len(procmesh.np), len(n_)))
+        if len(n_) == len(procmesh.np):
+            if len(n_) != 2 and len(n_) != 3: # https://github.com/mpip/pfft/issues/29
+                raise NotImplementedError("Currently using the same ProcMesh (%d) dimentions with Mesh (%d) is not supported other than 2don2d or 3don3d" % (len(procmesh.np), len(n_)))
+            if (self.flags & Flags.PFFT_PADDED_R2C) | (self.flags & Flags.PFFT_PADDED_C2R):
+                if self.type in (Type.R2C, Type.C2R, Type.R2CF, Type.C2RF):
+                    # https://github.com/mpip/pfft/pull/31
+                    raise NotImplementedError("Currently using the same ProcMesh (%d) dimentions with Mesh (%d) is not supported on padded transforms." % (len(procmesh.np), len(n_)))
 
-        self.type = Type(type)
-        self.flags = Flags(flags)
         cdef pfft_local_size_func func = PFFT_LOCAL_SIZE_FUNC[self.type]
 
 
@@ -774,7 +822,9 @@ cdef class Plan(object):
             inplace = False
         if inplace != self.inplace:
             raise ValueError("inplace status mismatch with the plan")
+
         func(self.plan, i.ptr, o.ptr)
+
     def __repr__(self):
         return "Plan(" + \
                 ','.join([
diff --git a/pfft/tests/test_pfft.py b/pfft/tests/test_pfft.py
index acb583e..5d5f632 100644
--- a/pfft/tests/test_pfft.py
+++ b/pfft/tests/test_pfft.py
@@ -196,3 +196,65 @@ def test_leak(comm):
         buffer = pfft.LocalBuffer(partition)
         #FIXME: check with @mpip if this is correct.
         i = buffer.view_input()
+
+@MPITest([4])
+def test_2d_on_2d_c2c(comm):  # forward complex-to-complex transform of an 8x8 mesh on a 2x2 process mesh, compared with numpy.fft.fftn
+    procmesh = pfft.ProcMesh(np=[2, 2], comm=comm)
+    N = (8, 8)
+
+    data = numpy.arange(numpy.prod(N), dtype='complex128').reshape(N)
+
+    correct = numpy.fft.fftn(data.copy())  # reference result computed from the full array
+    result = numpy.zeros_like(correct)
+
+    partition = pfft.Partition(pfft.Type.PFFT_C2C, N,
+        procmesh, flags=pfft.Flags.PFFT_ESTIMATE
+          | pfft.Flags.PFFT_TRANSPOSED_OUT
+#          | pfft.Flags.PFFT_DESTROY_INPUT
+          | pfft.Flags.PFFT_PRESERVE_INPUT
+        )
+
+    buffer1 = pfft.LocalBuffer(partition)
+    buffer2 = pfft.LocalBuffer(partition)
+
+    plan = pfft.Plan(partition, pfft.Direction.PFFT_FORWARD, buffer1, buffer2)
+
+    buffer1.view_input()[:] = data[partition.local_i_slice]  # load only this rank's slice of the input
+    plan.execute(buffer1, buffer2)
+
+    result[partition.local_o_slice] = buffer2.view_output()  # every other entry of `result` stays zero on this rank
+    result = comm.allreduce(result)  # presumably a sum across ranks (mpi4py default op), assembling the full output -- confirm
+    assert_almost_equal(correct, result)
+
+@MPITest([1, 4])
+def test_2d_on_2d_r2c(comm):  # forward real-to-complex transform of an 8x8 mesh on a 2-D process mesh, compared with numpy.fft.rfftn
+    if comm.size == 1:
+        procmesh = pfft.ProcMesh(np=[1, 1], comm=comm)  # single rank: 1x1 process mesh
+    else:
+        procmesh = pfft.ProcMesh(np=[2, 2], comm=comm)
+    N = (8, 8)
+
+    data = numpy.arange(numpy.prod(N), dtype='f8').reshape(N)
+
+    correct = numpy.fft.rfftn(data.copy())  # reference result computed from the full array
+    result = numpy.zeros_like(correct)
+
+    partition = pfft.Partition(pfft.Type.PFFT_R2C, N,
+        procmesh, flags=pfft.Flags.PFFT_ESTIMATE
+          | pfft.Flags.PFFT_TRANSPOSED_OUT
+#          | pfft.Flags.PFFT_DESTROY_INPUT
+          | pfft.Flags.PFFT_PRESERVE_INPUT
+#          | pfft.Flags.PADDED_R2C # doesn't work yet
+        )
+
+    buffer1 = pfft.LocalBuffer(partition)
+    buffer2 = pfft.LocalBuffer(partition)
+
+    plan = pfft.Plan(partition, pfft.Direction.PFFT_FORWARD, buffer1, buffer2)
+
+    buffer1.view_input()[:] = data[partition.local_i_slice]  # load only this rank's slice of the input
+    plan.execute(buffer1, buffer2)
+
+    result[partition.local_o_slice] = buffer2.view_output()  # every other entry of `result` stays zero on this rank
+    result = comm.allreduce(result)  # presumably a sum across ranks (mpi4py default op), assembling the full output -- confirm
+    assert_almost_equal(correct, result)
diff --git a/tests/roundtrip.py b/scripts/pfft-roundtrip-matrix.py
similarity index 57%
rename from tests/roundtrip.py
rename to scripts/pfft-roundtrip-matrix.py
index aa2e44c..964926c 100644
--- a/tests/roundtrip.py
+++ b/scripts/pfft-roundtrip-matrix.py
@@ -11,16 +11,22 @@
        inplace transform 
 
    Examples:
+
+   * to run in source code, first get a shell with
+       python runtests.py --shell
+
    * for single-rank numpy agreement test, run with
-       mpirun -np 1 python roundtrip.py -Nmesh 32 32 32 -Nmesh 3 3 3 -tree -verbose
+       mpirun -np 1 python roundtrip.py -Nmesh 32 32 32 -Nmesh 3 3 3 -verbose
 
    * for multi-rank tests, run with 
-       mpirun -np n python roundtrip.py -Nmesh 32 32 32 -Nmesh 3 3 3 -tree -verbose
+       mpirun -np 4 python roundtrip.py -Nmesh 32 32 32 -Nmesh 3 3 3 -verbose
 
    n can be any number. procmeshes tested are:
        np = [n], [1, n], [n, 1], [a, d], [d, a]
     where a * d == n and a d are closest to n** 0.5
 """
+from __future__ import print_function
+
 from mpi4py import MPI
 import itertools
 import traceback
@@ -28,52 +34,46 @@
 import argparse
 
 import os.path
-from sys import path
 
 parser = argparse.ArgumentParser(description='Roundtrip testing of pfft', 
         epilog=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter 
         )
 
-parser.add_argument('-Nmesh', nargs=3, type=int,
-        action='append', metavar=('Nx', 'Ny', 'Nz'), 
+from pfft import *
+
+oldprint = print  # keep a handle on the original print before the name is rebound below
+def print(*args, **kwargs):  # module-wide print that only emits on rank 0 of MPI.COMM_WORLD; other ranks stay silent
+    if MPI.COMM_WORLD.rank == 0:
+        oldprint(*args, **kwargs)
+
+parser.add_argument('-Nmesh', nargs='+', type=int,
+        action='append',
         help='size of FFT mesh, default is 29 30 31',
         default=[])
-parser.add_argument('-Nproc', nargs=2, type=int,
-        action='append', metavar=('Nx', 'Ny'), 
+parser.add_argument('-Nproc', nargs='+', type=int,
+        action='append',
         help='proc mesh',
         default=[])
-parser.add_argument('-tree', action='store_true', default=False,
-        help='Use pfft from source tree, ' +
-        'built with setup.py build_ext --inplace')
 parser.add_argument('-diag', action='store_true', default=False,
         help='show which one failed and which one passed')
+parser.add_argument('-rigor', default="estimate", choices=['estimate', 'measure', 'patient', 'exhaustive'],
+        help='the level of rigor in planning. ')
 parser.add_argument('-verbose', action='store_true', default=False,
         help='print which test will be ran')
 
-ns = parser.parse_args()
-Nmesh = ns.Nmesh
-if len(Nmesh) == 0:
-    # default 
-    Nmesh = [[29, 30, 31]]
-if ns.tree:
-    # prefers to use the locally built pfft in source tree, in case there is an
-    # installation
-    path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-
-from pfft import *
-
 class LargeError(Exception):
     pass
 
 def test_roundtrip_3d(procmesh, type, flags, inplace, Nmesh):
+
     partition = Partition(type, Nmesh, procmesh, flags)
     for rank in range(MPI.COMM_WORLD.size):
         MPI.COMM_WORLD.barrier()
         if rank != procmesh.rank:
             continue
-        #print procmesh.rank, 'roundtrip test, np=', procmesh.np, 'Nmesh = ', Nmesh, 'inplace = ', inplace
-        #print repr(partition)
+        #oldprint(procmesh.rank, 'roundtrip test, np=', procmesh.np, 'Nmesh = ', Nmesh, 'inplace = ', inplace)
+        #oldprint(repr(partition))
 
     buf1 = LocalBuffer(partition)
     if inplace:
@@ -96,9 +96,7 @@ def test_roundtrip_3d(procmesh, type, flags, inplace, Nmesh):
             buf2,
             type=type,
             flags=flags)
-    if procmesh.rank == 0:
-        #print repr(forward)
-        pass
+    # print(repr(forward))
 
     # find the inverse plan
     typemap = {
@@ -139,9 +137,7 @@ def test_roundtrip_3d(procmesh, type, flags, inplace, Nmesh):
             type=btype, 
             flags=bflags,
             )
-    if procmesh.rank == 0:
-        #print repr(backward)
-        pass
+    #print(repr(backward))
 
     numpy.random.seed(9999)
 
@@ -183,8 +179,7 @@ def test_roundtrip_3d(procmesh, type, flags, inplace, Nmesh):
         MPI.COMM_WORLD.barrier()
         if rank != procmesh.rank:
             continue
-        if False:
-            print('error', original - input)
+        # oldprint('error', original - input)
         MPI.COMM_WORLD.barrier()
     if False:
         print(repr(forward.type), 'forward', "error = ", r2cerr)
@@ -193,29 +188,43 @@ def test_roundtrip_3d(procmesh, type, flags, inplace, Nmesh):
     r2cerr = MPI.COMM_WORLD.allreduce(r2cerr, MPI.MAX)
     c2rerr = MPI.COMM_WORLD.allreduce(c2rerr, MPI.MAX)
     if (r2cerr > 5e-4):
-        raise LargeError("r2c: %g" % r2cerr)
+        raise LargeError("forward: %g" % r2cerr)
 
     if (c2rerr > 5e-4):
-        raise LargeError("c2r: %g" % c2rerr)
-
-if MPI.COMM_WORLD.size == 1: 
-    nplist = [
-            [1],
-            [1, 1],
-            ]
-else:
-    nplist = ns.Nproc
-            
-
-try:
-    flags = [
-            Flags.PFFT_ESTIMATE | Flags.PFFT_DESTROY_INPUT,
-            Flags.PFFT_ESTIMATE | Flags.PFFT_PADDED_R2C | Flags.PFFT_DESTROY_INPUT,
-            Flags.PFFT_ESTIMATE | Flags.PFFT_PADDED_R2C,
-            Flags.PFFT_ESTIMATE | Flags.PFFT_TRANSPOSED_OUT,
-            Flags.PFFT_ESTIMATE | Flags.PFFT_TRANSPOSED_OUT | Flags.PFFT_DESTROY_INPUT,
-            Flags.PFFT_ESTIMATE | Flags.PFFT_PADDED_R2C | Flags.PFFT_TRANSPOSED_OUT,
-            ]
+        raise LargeError("backward: %g" % c2rerr)
+
+def main():
+
+    ns = parser.parse_args()
+    Nmesh = ns.Nmesh
+
+    if len(Nmesh) == 0:
+        # default 
+        Nmesh = [[29, 30, 31]]
+
+    if MPI.COMM_WORLD.size == 1 and len(ns.Nproc) == 0:
+        nplist = [ [1], [1, 1], ]
+    else:
+        nplist = ns.Nproc
+
+    rigor = {
+            'exhaustive': Flags.PFFT_EXHAUSTIVE,
+            'patient' : Flags.PFFT_PATIENT,
+            'estimate' : Flags.PFFT_ESTIMATE,
+            'measure' : Flags.PFFT_MEASURE,
+            }[ns.rigor]
+    import itertools
+    import functools
+
+    flags = []
+    matrix = Flags.PFFT_DESTROY_INPUT, Flags.PFFT_PADDED_R2C, Flags.PFFT_TRANSPOSED_OUT
+    print_flags = functools.reduce(lambda x, y: x | y, matrix, rigor)
+
+    matrix2 = [[0, i] for i in matrix]
+    for row in itertools.product(*matrix2):
+        flag = functools.reduce(lambda x, y: x | y, row, rigor)
+        flags.append(flag)
+
     params = list(itertools.product(
             nplist, [Type.PFFT_C2C, Type.PFFT_R2C, Type.PFFTF_C2C, Type.PFFTF_R2C], flags, [True, False],
             Nmesh,
@@ -223,11 +232,11 @@ def test_roundtrip_3d(procmesh, type, flags, inplace, Nmesh):
 
     PASS = []
     FAIL = []
+    IMPL = []
     for param in params:
-        if MPI.COMM_WORLD.rank == 0:
-            if ns.verbose:
-                f = param
-                print("NP", f[0], repr(Type(f[1])), repr(Flags(f[2])), "InPlace", f[3], "Nmesh", f[4])
+        if ns.verbose:
+            f = param
+            print("NP", f[0], repr(Type(f[1])), repr(Flags(f[2])), "InPlace", f[3], "Nmesh", f[4])
         np = param[0]
         procmesh = ProcMesh(np=np)
         try:
@@ -236,19 +245,78 @@ def test_roundtrip_3d(procmesh, type, flags, inplace, Nmesh):
         except LargeError as e:
             if ns.verbose:
                 f = param
-                print("Failed", e)
+                print("Failed", f, e)
             FAIL.append((param, e))
+        except NotImplementedError as e:
+            if ns.verbose:
+                f = param
+                print("notsupported", f, e)
+            IMPL.append((param, e))
+
+    N = len(PASS) + len(FAIL) + len(IMPL)
+
+    print("PASS", len(PASS), '/', N)
+
+    if ns.diag:
+        printcase("", "", print_flags, header=True)
+        for f in PASS:
+            printcase(f, "", print_flags, )
+
+    print("UNIMPL", len(IMPL), '/', N)
+    if ns.diag:
+        printcase("", "", print_flags, header=True)
+        for f, e in IMPL:
+            printcase(f, e, print_flags)
+
+    print("FAIL", len(FAIL), '/', N)
+    if ns.diag:
+        printcase("", "", print_flags, header=True)
+        for f, e in FAIL:
+            printcase(f, e, print_flags)
+
+    if len(FAIL) != 0:
+        return 1
+
+    return 0
+
+def printcase(f, e, flags, header=False):  # print one row of the result table; f = (np, type, flags, inplace, Nmesh), e = text for the ERROR column, flags = mask handed to Flags.format
+    if header:  # header row: column titles only, f and e are not read
+        inplace = "INPLACE"
+        np = "NP"
+        flags = "FLAGS"
+        type = "TYPE"
+        nmesh = "NMESH"
+        error = "ERROR"
+    else:
+        inplace = "INPL" if f[3] else "OUTP"
+        np = str(f[0])
+        flags = Flags(f[2]).format(flags)  # rebinds `flags`: the mask argument is consumed here and replaced by the rendered text
+        type = repr(Type(f[1]))
+        nmesh = str(f[4])
+        error = str(e)
+    print("%(np)-6s %(nmesh)-8s %(type)-6s %(inplace)-6s %(flags)-80s %(error)-s" % locals())  # column values are looked up by local-variable name
+
+# use unbuffered stdout
+class Unbuffered(object):  # stream wrapper that flushes the wrapped stream after every write
+   def __init__(self, stream):
+       self.stream = stream
+   def write(self, data):
+       self.stream.write(data)
+       self.stream.flush()  # push the data out immediately instead of waiting for the buffer to fill
+   def writelines(self, datas):
+       self.stream.writelines(datas)
+       self.stream.flush()
+   def __getattr__(self, attr):  # anything not defined on the wrapper is looked up on the wrapped stream
+       return getattr(self.stream, attr)
+
+import sys
+sys.stdout = Unbuffered(sys.stdout)
+
+if __name__ == '__main__':
+    try:
+        sys.exit(main())
+    except Exception:
+        # print() here is rank-0 only: use oldprint so any rank reports its traceback, then abort non-zero.
+        oldprint(traceback.format_exc())
+        MPI.COMM_WORLD.Abort(1)
 
-    if MPI.COMM_WORLD.rank == 0:
-        print("PASS", len(PASS), '/', len(params))
-        if ns.diag:
-            for f in PASS:
-                print("NP", f[0], repr(Type(f[1])), repr(Flags(f[2])), "InPlace", f[3], "Nmesh", f[4])
-        print("FAIL", len(FAIL), '/', len(params))
-        if ns.diag:
-            for f, e in FAIL:
-                print("NP", f[0], repr(Type(f[1])), repr(Flags(f[2])), "InPlace", f[3], "Nmesh", f[4], e)
-        assert len(FAIL) == 0
-except Exception as e:
-    print(traceback.format_exc())
-    MPI.COMM_WORLD.Abort()
diff --git a/setup.py b/setup.py
index 00dc827..b7c4d11 100644
--- a/setup.py
+++ b/setup.py
@@ -104,6 +104,7 @@ def find_version(path):
                 cython_directives = {"embedsignature": True}
                 )]),
     license='GPL3',
+    scripts=['scripts/pfft-roundtrip-matrix.py'],
     cmdclass = {
         "build_py":build_py,
         "build_ext": build_ext_subclass}