From f29d430b64d6b17f4645ef3510947be6a1a6f399 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Tue, 7 Jul 2015 15:05:03 -0700 Subject: [PATCH 01/35] use embeded fftw3 as a subpackage This hugely simplifies the building process of pfft. Since the new planners are unlikely to be included in next release of FFTW, it makes sense to just ship in a release the patched version of FFTW that is known to work with PFFT. touch fftw3/Changelog --- .gitmodules | 3 +++ Makefile.am | 9 ++++++--- api/Makefile.am | 4 ++++ bootstrap.sh | 4 ++-- configure.ac | 36 ++++-------------------------------- fftw3 | 1 + gcell/Makefile.am | 4 ++++ kernel/Makefile.am | 4 ++++ tests/Makefile.am | 8 +++++++- tests/f03/Makefile.am | 6 ++++++ tests/fortran/Makefile.am | 11 ++++++++++- util/Makefile.am | 4 ++++ 12 files changed, 55 insertions(+), 39 deletions(-) create mode 100644 .gitmodules create mode 160000 fftw3 diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..acb7661 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "fftw3"] + path = fftw3 + url = https://github.com/rainwoodman/fftw3 diff --git a/Makefile.am b/Makefile.am index eb1ccd6..7f6387d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -16,7 +16,7 @@ AM_DISTCHECK_CONFIGURE_FLAGS = \ "LDFLAGS=@LDFLAGS@" # Traverse these subdirectories, the current one first (for config.h). -SUBDIRS = kernel gcell util api . +SUBDIRS = fftw3 kernel gcell util api . if ENABLE_TESTS SUBDIRS += tests @@ -43,10 +43,11 @@ lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_LIBADD = \ kernel/libkernel.la \ util/libutil.la \ api/libapi.la \ - gcell/libgcell.la + gcell/libgcell.la \ + fftw3/libfftw3@PREC_SUFFIX@@OPENMP_SUFFIX@.la lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_LDFLAGS = -no-undefined -version-info @SHARED_VERSION_INFO@ -lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_CFLAGS = $(OPENMP_CFLAGS) +lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_CFLAGS = $(OPENMP_CFLAGS) -I $(top_srcdir)/mpi -I $(top_srcdir)/api # Get Fortran compile rules that include preprocessing. include $(top_srcdir)/build-aux/fortran-rules.am @@ -58,6 +59,8 @@ pfft1@PREC_SUFFIX@@OPENMP_SUFFIX@.pc: pfft.pc pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = pfft.pc +DISTCHECK_CONFIGURE_FLAGS=--disable-shared --disable-doc --enable-mpi --enable-fortran + ################################################################# # Documentation ################################################################# diff --git a/api/Makefile.am b/api/Makefile.am index 596a3cb..6b2ff47 100644 --- a/api/Makefile.am +++ b/api/Makefile.am @@ -3,6 +3,10 @@ AM_CPPFLAGS = -I$(top_srcdir)/kernel # Directory of util.h AM_CPPFLAGS += -I$(top_srcdir)/util +# +# Directory of fftw3 +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/mpi # OpenMP support AM_CFLAGS = $(OPENMP_CFLAGS) diff --git a/bootstrap.sh b/bootstrap.sh index 2d34044..b37d5b9 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -35,12 +35,12 @@ alias libtoolize=$(type -p glibtoolize libtoolize | head -1) touch ChangeLog +touch fftw3/ChangeLog echo "PLEASE IGNORE WARNINGS AND ERRORS" - rm -rf autom4te.cache libtoolize -autoreconf --verbose --install --force +autoreconf --verbose --install --symlink --force rm -f config.cache diff --git a/configure.ac b/configure.ac index 796603c..279dd73 100644 --- a/configure.ac +++ b/configure.ac @@ -31,7 +31,7 @@ AC_PREREQ(2.64) # autoconf initialization -AC_INIT([PFFT],[1.0.8-alpha],[michael.pippig@mathematik.tu-chemnitz.de],[pfft],[http://www.tu-chemnitz.de/~mpip/software/]) +AC_INIT([PFFT],[1.0.8-alpha-fftw3],[michael.pippig@mathematik.tu-chemnitz.de],[pfft],[http://www.tu-chemnitz.de/~mpip/software/]) m4_ifndef([AC_PACKAGE_URL], [AC_SUBST([PACKAGE_URL], [http://www.tu-chemnitz.de/~mpip/software/])]) @@ -263,6 +263,7 @@ SHARED_VERSION_INFO="0:0:0" # substitute SHARED_VERSION_INFO in generated Makefiles AC_SUBST(SHARED_VERSION_INFO) +AC_DISABLE_SHARED dnl to hell with shared libraries ################################################################################ # program checks @@ -280,37 +281,6 @@ AC_SUBST(SHARED_VERSION_INFO) # May need sincos from libm. AC_CHECK_LIB([m], [sincos]) -# Check for FFTW3, MPI FFTW and threaded FFTW. -if test "x$PRECISION" = "xs" ; then - AX_LIB_FFTW3F -elif test "x$PRECISION" = "xl" ; then - AX_LIB_FFTW3L -else - AX_LIB_FFTW3 -fi - -if test "x$ax_lib_fftw3" = "xno"; then - AC_MSG_ERROR([You do not seem to have the FFTW-3.3 library installed.] - [You can download it from http://www.fftw.org. If you have installed FFTW-3.3, ] - [make sure that this configure script can find it. See ./configure ] - [--help for more information.]) -fi - -if test "x$ax_lib_fftw3_mpi" = "xno"; then - AC_MSG_ERROR([You do not seem to have the MPI part of the FFTW-3.3 library installed.] - [You can download it from http://www.fftw.org. If you have installed FFTW-3.3, ] - [make sure that this configure script can find it. See ./configure --help] - [for more information.]) -fi - -if test "x$enable_openmp" = "xyes" -a "x$ax_lib_fftw3_openmp" = "xno"; then - AC_MSG_ERROR([You do not seem to have the OpenMP part of the FFTW-3.3 library installed.] - [You can download it from http://www.fftw.org. If you have installed FFTW-3.3, ] - [make sure that this configure script can find it. See ./configure --help] - [for more information.]) -fi - - ################################################################################ # compiler options ################################################################################ @@ -482,4 +452,6 @@ AC_CONFIG_LINKS([tests/build_checks.sh:tests/build_checks.sh tests/run_checks.sh:tests/run_checks.sh tests/manual_c2c_3d.c:tests/manual_c2c_3d.c]) +AC_CONFIG_SUBDIRS([fftw3]) + AC_OUTPUT diff --git a/fftw3 b/fftw3 new file mode 160000 index 0000000..02d96b9 --- /dev/null +++ b/fftw3 @@ -0,0 +1 @@ +Subproject commit 02d96b9ff8935c2952ce8539ce0e4800aeca781f diff --git a/gcell/Makefile.am b/gcell/Makefile.am index 9f28d98..a0f7bd9 100644 --- a/gcell/Makefile.am +++ b/gcell/Makefile.am @@ -6,6 +6,10 @@ AM_CPPFLAGS += -I$(top_srcdir)/api # Directory of util.h AM_CPPFLAGS += -I$(top_srcdir)/util +# +# Directory of fftw3 +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/mpi noinst_LTLIBRARIES = libgcell.la diff --git a/kernel/Makefile.am b/kernel/Makefile.am index 9e3ad52..307e2b5 100644 --- a/kernel/Makefile.am +++ b/kernel/Makefile.am @@ -6,6 +6,10 @@ AM_CPPFLAGS += -I$(top_srcdir)/api # Directory of util.h AM_CPPFLAGS += -I$(top_srcdir)/util +# +# Directory of fftw +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/mpi noinst_LTLIBRARIES = libkernel.la diff --git a/tests/Makefile.am b/tests/Makefile.am index 9cdef9f..901c754 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -19,12 +19,18 @@ AM_CPPFLAGS = -I$(top_srcdir)/kernel # Directory of pfft.h AM_CPPFLAGS += -I$(top_srcdir)/api +# +# Directory of fftw3 +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/mpi # OpenMP support AM_CFLAGS = $(OPENMP_CFLAGS) # Libraries to add to all programs that are built. -LDADD = $(top_builddir)/lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@.la $(fftw3_openmp_LIBS) $(fftw3_mpi_LIBS) $(fftw3_LIBS) +LDADD = $(top_builddir)/lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@.la +LDADD += $(top_builddir)/fftw3/libfftw3@PREC_SUFFIX@@OPENMP_SUFFIX@.la +LDADD += $(top_builddir)/fftw3/mpi/libfftw3@PREC_SUFFIX@_mpi.la EXTRA_DIST = \ diff --git a/tests/f03/Makefile.am b/tests/f03/Makefile.am index d5e7945..6122e10 100644 --- a/tests/f03/Makefile.am +++ b/tests/f03/Makefile.am @@ -3,12 +3,18 @@ include $(top_srcdir)/build-aux/fortran-rules.am # Directory of pnfft.f (which is build first) AM_FCCPPFLAGS = -I$(top_builddir)/api +# +# Directory of fftw3 +AM_FCCPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_FCCPPFLAGS += -I$(top_srcdir)/fftw3/mpi # OpenMP support AM_FCFLAGS = $(OPENMP_FCFLAGS) # Libraries to add to all programs that are built. LDADD = $(top_builddir)/lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@.la $(fftw3_openmp_LIBS) $(fftw3_mpi_LIBS) $(fftw3_LIBS) +LDADD += $(top_builddir)/fftw3/libfftw3@PREC_SUFFIX@@OPENMP_SUFFIX@.la +LDADD += $(top_builddir)/fftw3/mpi/libfftw3@PREC_SUFFIX@_mpi.la # noinst_LTLIBRARIES = libtests.la diff --git a/tests/fortran/Makefile.am b/tests/fortran/Makefile.am index 88b9a98..6adee68 100644 --- a/tests/fortran/Makefile.am +++ b/tests/fortran/Makefile.am @@ -4,11 +4,20 @@ include $(top_srcdir)/build-aux/fortran-rules.am # Directory of pfft.f (which is build first) AM_FCCPPFLAGS = -I$(top_builddir)/api +# Directory of pfft.f (which is build first) +AM_FCCPPFLAGS = -I$(top_builddir)/api + +# Directory of fftw3 +AM_FCCPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_FCCPPFLAGS += -I$(top_srcdir)/fftw3/mpi + # OpenMP support AM_FCFLAGS = $(OPENMP_FCFLAGS) # Libraries to add to all programs that are built. -LDADD = $(top_builddir)/lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@.la $(fftw3_openmp_LIBS) $(fftw3_mpi_LIBS) $(fftw3_LIBS) +LDADD = $(top_builddir)/lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@.la +LDADD += $(top_builddir)/fftw3/libfftw3@PREC_SUFFIX@@OPENMP_SUFFIX@.la +LDADD += $(top_builddir)/fftw3/mpi/libfftw3@PREC_SUFFIX@_mpi.la # noinst_LTLIBRARIES = libtests.la diff --git a/util/Makefile.am b/util/Makefile.am index 9b70925..2885f4e 100644 --- a/util/Makefile.am +++ b/util/Makefile.am @@ -3,6 +3,10 @@ AM_CPPFLAGS = -I$(top_srcdir)/kernel # Directory of pfft.h AM_CPPFLAGS += -I$(top_srcdir)/api +# +# Directory of fftw3 +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/mpi noinst_LTLIBRARIES = libutil.la From d03c15e29d2715443808cafd9f84d421c2fc1468 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Wed, 16 Sep 2015 13:14:37 -0700 Subject: [PATCH 02/35] Better omp support? --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 279dd73..e3c9b2e 100644 --- a/configure.ac +++ b/configure.ac @@ -31,7 +31,7 @@ AC_PREREQ(2.64) # autoconf initialization -AC_INIT([PFFT],[1.0.8-alpha-fftw3],[michael.pippig@mathematik.tu-chemnitz.de],[pfft],[http://www.tu-chemnitz.de/~mpip/software/]) +AC_INIT([PFFT],[1.0.8-alpha1-fftw3],[michael.pippig@mathematik.tu-chemnitz.de],[pfft],[http://www.tu-chemnitz.de/~mpip/software/]) m4_ifndef([AC_PACKAGE_URL], [AC_SUBST([PACKAGE_URL], [http://www.tu-chemnitz.de/~mpip/software/])]) From a04e48715a55cd01179c371c5400d970c9ad636f Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Wed, 16 Sep 2015 13:23:57 -0700 Subject: [PATCH 03/35] Better omp support --- Makefile.am | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index 7f6387d..c7ff816 100644 --- a/Makefile.am +++ b/Makefile.am @@ -44,7 +44,6 @@ lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_LIBADD = \ util/libutil.la \ api/libapi.la \ gcell/libgcell.la \ - fftw3/libfftw3@PREC_SUFFIX@@OPENMP_SUFFIX@.la lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_LDFLAGS = -no-undefined -version-info @SHARED_VERSION_INFO@ lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_CFLAGS = $(OPENMP_CFLAGS) -I $(top_srcdir)/mpi -I $(top_srcdir)/api From 7641c1094526b56b8108d06357e32a6d8526f7bd Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Sun, 5 Jun 2016 14:21:27 -0700 Subject: [PATCH 04/35] avoid backslash --- Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index c7ff816..ef5dc59 100644 --- a/Makefile.am +++ b/Makefile.am @@ -43,7 +43,7 @@ lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_LIBADD = \ kernel/libkernel.la \ util/libutil.la \ api/libapi.la \ - gcell/libgcell.la \ + gcell/libgcell.la lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_LDFLAGS = -no-undefined -version-info @SHARED_VERSION_INFO@ lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_CFLAGS = $(OPENMP_CFLAGS) -I $(top_srcdir)/mpi -I $(top_srcdir)/api From 8985ddda244ca9b9543bfe5ec82329ae86bcdbae Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Sun, 5 Jun 2016 15:46:14 -0700 Subject: [PATCH 05/35] update fftw3. --- fftw3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fftw3 b/fftw3 index 02d96b9..b1a807b 160000 --- a/fftw3 +++ b/fftw3 @@ -1 +1 @@ -Subproject commit 02d96b9ff8935c2952ce8539ce0e4800aeca781f +Subproject commit b1a807bdf2cd183754e9b8de3f4e2d948832f959 From 80f5f6760a0c1cd3b2a1ae8a5a472a9044e4bffb Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Sun, 5 Jun 2016 15:47:46 -0700 Subject: [PATCH 06/35] bump version --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index e3c9b2e..5290a71 100644 --- a/configure.ac +++ b/configure.ac @@ -31,7 +31,7 @@ AC_PREREQ(2.64) # autoconf initialization -AC_INIT([PFFT],[1.0.8-alpha1-fftw3],[michael.pippig@mathematik.tu-chemnitz.de],[pfft],[http://www.tu-chemnitz.de/~mpip/software/]) +AC_INIT([PFFT],[1.0.8-alpha2-fftw3],[michael.pippig@mathematik.tu-chemnitz.de],[pfft],[http://www.tu-chemnitz.de/~mpip/software/]) m4_ifndef([AC_PACKAGE_URL], [AC_SUBST([PACKAGE_URL], [http://www.tu-chemnitz.de/~mpip/software/])]) From 7438be04c10f4e8dbb842507f2f5ec1859005d68 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Mon, 4 Dec 2017 14:16:58 -0800 Subject: [PATCH 07/35] bump version and embedded fftw --- configure.ac | 2 +- fftw3 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 5290a71..9683c4f 100644 --- a/configure.ac +++ b/configure.ac @@ -31,7 +31,7 @@ AC_PREREQ(2.64) # autoconf initialization -AC_INIT([PFFT],[1.0.8-alpha2-fftw3],[michael.pippig@mathematik.tu-chemnitz.de],[pfft],[http://www.tu-chemnitz.de/~mpip/software/]) +AC_INIT([PFFT],[1.0.8-alpha3-fftw3],[michael.pippig@mathematik.tu-chemnitz.de],[pfft],[http://www.tu-chemnitz.de/~mpip/software/]) m4_ifndef([AC_PACKAGE_URL], [AC_SUBST([PACKAGE_URL], [http://www.tu-chemnitz.de/~mpip/software/])]) diff --git a/fftw3 b/fftw3 index b1a807b..9db17dc 160000 --- a/fftw3 +++ b/fftw3 @@ -1 +1 @@ -Subproject commit b1a807bdf2cd183754e9b8de3f4e2d948832f959 +Subproject commit 9db17dc1171bd03312d7460da6886cae5dca2022 From a889f7d0e6144c02d089e8044a6066ede921a6e9 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Mon, 4 Dec 2017 14:21:14 -0800 Subject: [PATCH 08/35] use fftw 3.3.7 --- fftw3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fftw3 b/fftw3 index 9db17dc..7a893f4 160000 --- a/fftw3 +++ b/fftw3 @@ -1 +1 @@ -Subproject commit 9db17dc1171bd03312d7460da6886cae5dca2022 +Subproject commit 7a893f428f093c034457de69011d6acb39aafd6f From b64ae80a9fb11e7b8f70f1c453019d3a99b86c43 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Sat, 13 Jan 2018 00:35:30 -0800 Subject: [PATCH 09/35] add 2d on 2d test case. --- tests/Makefile.am | 3 ++ tests/simple_check_r2c_2d_on_2d.c | 80 +++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 tests/simple_check_r2c_2d_on_2d.c diff --git a/tests/Makefile.am b/tests/Makefile.am index 901c754..6482188 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -106,6 +106,9 @@ check_PROGRAMS += \ check_PROGRAMS += \ simple_check_r2c_3d_on_3d_transposed_many +check_PROGRAMS += \ + simple_check_r2c_2d_on_2d + check_PROGRAMS += \ simple_check_ousam_c2c simple_check_ousam_c2c_transposed \ simple_check_ousam_c2r \ diff --git a/tests/simple_check_r2c_2d_on_2d.c b/tests/simple_check_r2c_2d_on_2d.c new file mode 100644 index 0000000..e79509f --- /dev/null +++ b/tests/simple_check_r2c_2d_on_2d.c @@ -0,0 +1,80 @@ +#include +#include + +int main(int argc, char **argv) +{ + int np[3]; + ptrdiff_t n[3]; + ptrdiff_t alloc_local; + ptrdiff_t local_ni[3], local_i_start[3]; + ptrdiff_t local_no[3], local_o_start[3]; + double err, *in; + pfft_complex *out; + pfft_plan plan_forw=NULL, plan_back=NULL; + MPI_Comm comm_cart_3d; + + /* Set size of FFT and process mesh */ + n[0] = 29; n[1] = 27; n[2] = 31; + np[0] = 2; np[1] = 2; np[2] = 2; + + /* Initialize MPI and PFFT */ + MPI_Init(&argc, &argv); + pfft_init(); + + /* Create three-dimensional process grid of size np[0] x np[1] x np[2], if possible */ + if( pfft_create_procmesh(2, MPI_COMM_WORLD, np, &comm_cart_3d) ){ + pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]*np[2]); + MPI_Finalize(); + return 1; + } + + /* Get parameters of data distribution */ + alloc_local = pfft_local_size_dft_r2c(2, n, comm_cart_3d, PFFT_TRANSPOSED_NONE, + local_ni, local_i_start, local_no, local_o_start); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "%td.\n", alloc_local); + + /* Allocate memory */ + in = pfft_alloc_real(2 * alloc_local); + out = pfft_alloc_complex(alloc_local); + + /* Plan parallel forward FFT */ + plan_forw = pfft_plan_dft_r2c(2, + n, in, out, comm_cart_3d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT); + + /* Plan parallel backward FFT */ + plan_back = pfft_plan_dft_c2r(2, + n, out, in, comm_cart_3d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT); + + /* Initialize input with random numbers */ + pfft_init_input_real(2, n, local_ni, local_i_start, + in); + + /* execute parallel forward FFT */ + pfft_execute(plan_forw); + + /* clear the old input */ + pfft_clear_input_real(2, n, local_ni, local_i_start, + in); + + /* execute parallel backward FFT */ + pfft_execute(plan_back); + + /* Scale data */ + for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1] * local_ni[2]; l++) + in[l] /= (n[0]*n[1]*n[2]); + + /* Print error of back transformed data */ + MPI_Barrier(MPI_COMM_WORLD); + err = pfft_check_output_real(2, n, local_ni, local_i_start, in, comm_cart_3d); + pfft_printf(comm_cart_3d, "Error after one forward and backward trafo of size n=(%td, %td, %td):\n", n[0], n[1], n[2]); + pfft_printf(comm_cart_3d, "maxerror = %6.2e;\n", err); + + /* free mem and finalize */ + pfft_destroy_plan(plan_forw); + pfft_destroy_plan(plan_back); + MPI_Comm_free(&comm_cart_3d); + pfft_free(in); pfft_free(out); + MPI_Finalize(); + return 0; +} From bd0e8d3ede3a49ea4d57d5ae8d64d1d426e58694 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Sat, 13 Jan 2018 00:49:17 -0800 Subject: [PATCH 10/35] update 2don2d --- tests/simple_check_r2c_2d_on_2d.c | 38 +++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/simple_check_r2c_2d_on_2d.c b/tests/simple_check_r2c_2d_on_2d.c index e79509f..3443e48 100644 --- a/tests/simple_check_r2c_2d_on_2d.c +++ b/tests/simple_check_r2c_2d_on_2d.c @@ -3,33 +3,33 @@ int main(int argc, char **argv) { - int np[3]; - ptrdiff_t n[3]; + int np[2]; + ptrdiff_t n[2]; ptrdiff_t alloc_local; - ptrdiff_t local_ni[3], local_i_start[3]; - ptrdiff_t local_no[3], local_o_start[3]; + ptrdiff_t local_ni[2], local_i_start[2]; + ptrdiff_t local_no[2], local_o_start[2]; double err, *in; pfft_complex *out; pfft_plan plan_forw=NULL, plan_back=NULL; - MPI_Comm comm_cart_3d; + MPI_Comm comm_cart_2d; /* Set size of FFT and process mesh */ - n[0] = 29; n[1] = 27; n[2] = 31; - np[0] = 2; np[1] = 2; np[2] = 2; + n[0] = 29; n[1] = 27; + np[0] = 2; np[1] = 2; /* Initialize MPI and PFFT */ MPI_Init(&argc, &argv); pfft_init(); - /* Create three-dimensional process grid of size np[0] x np[1] x np[2], if possible */ - if( pfft_create_procmesh(2, MPI_COMM_WORLD, np, &comm_cart_3d) ){ - pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]*np[2]); + /* Create three-dimensional process grid of size np[0] x np[1], if possible */ + if( pfft_create_procmesh(2, MPI_COMM_WORLD, np, &comm_cart_2d) ){ + pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]); MPI_Finalize(); return 1; } /* Get parameters of data distribution */ - alloc_local = pfft_local_size_dft_r2c(2, n, comm_cart_3d, PFFT_TRANSPOSED_NONE, + alloc_local = pfft_local_size_dft_r2c(2, n, comm_cart_2d, PFFT_TRANSPOSED_NONE, local_ni, local_i_start, local_no, local_o_start); pfft_fprintf(MPI_COMM_WORLD, stderr, "%td.\n", alloc_local); @@ -40,11 +40,11 @@ int main(int argc, char **argv) /* Plan parallel forward FFT */ plan_forw = pfft_plan_dft_r2c(2, - n, in, out, comm_cart_3d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT); + n, in, out, comm_cart_2d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT); /* Plan parallel backward FFT */ plan_back = pfft_plan_dft_c2r(2, - n, out, in, comm_cart_3d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT); + n, out, in, comm_cart_2d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT); /* Initialize input with random numbers */ pfft_init_input_real(2, n, local_ni, local_i_start, @@ -61,19 +61,19 @@ int main(int argc, char **argv) pfft_execute(plan_back); /* Scale data */ - for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1] * local_ni[2]; l++) - in[l] /= (n[0]*n[1]*n[2]); + for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1]; l++) + in[l] /= (n[0]*n[1]); /* Print error of back transformed data */ MPI_Barrier(MPI_COMM_WORLD); - err = pfft_check_output_real(2, n, local_ni, local_i_start, in, comm_cart_3d); - pfft_printf(comm_cart_3d, "Error after one forward and backward trafo of size n=(%td, %td, %td):\n", n[0], n[1], n[2]); - pfft_printf(comm_cart_3d, "maxerror = %6.2e;\n", err); + err = pfft_check_output_real(2, n, local_ni, local_i_start, in, comm_cart_2d); + pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td, %td):\n", n[0], n[1]); + pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err); /* free mem and finalize */ pfft_destroy_plan(plan_forw); pfft_destroy_plan(plan_back); - MPI_Comm_free(&comm_cart_3d); + MPI_Comm_free(&comm_cart_2d); pfft_free(in); pfft_free(out); MPI_Finalize(); return 0; From e56d072456de799bae5e8f8e0b674f25d21fe3bf Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Wed, 17 Jan 2018 14:00:22 -0800 Subject: [PATCH 11/35] add generic remap_nd interface. --- api/api-adv.c | 28 +--- api/api-basic.c | 12 +- kernel/Makefile.am | 1 + kernel/ipfft.h | 106 +++++++++----- kernel/partrafo.c | 54 +++---- kernel/procmesh.c | 209 --------------------------- kernel/remap.c | 306 +++++++++++++++++++++++++++++++++++++++ kernel/remap_3dto2d.c | 322 ++++++++++++++++++++++++------------------ kernel/timer.c | 20 +-- util/util.c | 11 -- util/util.h | 3 - 11 files changed, 604 insertions(+), 468 deletions(-) create mode 100644 kernel/remap.c diff --git a/api/api-adv.c b/api/api-adv.c index 596760e..e1edca8 100644 --- a/api/api-adv.c +++ b/api/api-adv.c @@ -22,11 +22,6 @@ #include "ipfft.h" #include "util.h" -static void calculate_3dto2d_blocks( - const INT *n, MPI_Comm comm_cart_3d, - INT *iblk); - - INT PX(local_size_many_dft)( int rnk_n, const INT *n, const INT *ni, const INT *no, INT howmany, const INT *iblock, const INT *oblock, @@ -238,7 +233,7 @@ PX(gcplan) PX(plan_many_rgc)( ) { int rnk_pm; - INT blk_3dto2d[3]; + INT blk_for_remap[3]; MPI_Comm *comms_pm; PX(gcplan) ths; MPI_Comm comm_cart = PX(assure_cart_comm)(comm); @@ -256,13 +251,13 @@ PX(gcplan) PX(plan_many_rgc)( comms_pm = (MPI_Comm*) malloc(sizeof(MPI_Comm) * (size_t) rnk_pm); PX(split_cart_procmesh)(comm_cart, comms_pm); - if( PX(needs_3dto2d_remap)(rnk_n, comm_cart) ){ + if( PX(needs_remap_nd)(rnk_n, comm_cart) ){ /* 3d to 2d remap results in complicated blocks. * We ignore users input and use default block size. */ - calculate_3dto2d_blocks(pn, comm_cart, - blk_3dto2d); + PX(remap_nd_calculate_blocks)(rnk_n, pn, comm_cart, + blk_for_remap); ths = PX(plan_rgc_internal)( - rnk_n, pn, howmany, blk_3dto2d, gc_below, gc_above, + rnk_n, pn, howmany, blk_for_remap, gc_below, gc_above, data, rnk_pm, comms_pm, comm_cart, gc_flags); } else ths = PX(plan_rgc_internal)( @@ -279,19 +274,6 @@ PX(gcplan) PX(plan_many_rgc)( return ths; } -static void calculate_3dto2d_blocks( - const INT *n, MPI_Comm comm_cart_3d, - INT *iblk - ) -{ - int q0, q1, p0, p1; - INT mblk[3], oblk[3]; - - PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); - PX(default_block_size_3dto2d)(n, p0, p1, q0, q1, - iblk, mblk, oblk); -} - PX(gcplan) PX(plan_many_cgc)( int rnk_n, const INT *n, INT howmany, const INT *block, diff --git a/api/api-basic.c b/api/api-basic.c index aab76be..415cb78 100644 --- a/api/api-basic.c +++ b/api/api-basic.c @@ -1076,9 +1076,9 @@ static void PX(execute_full)( twiddle_input(ths, ths->in, ths->out, in, out); PFFT_FINISH_TIMING(ths->timer->itwiddle); - PFFT_START_TIMING(ths->comm_cart, ths->timer->remap_3dto2d[0]); - PX(execute_remap_3dto2d)(ths->remap_3dto2d[0], ths->in, ths->out, in, out); - PFFT_FINISH_TIMING(ths->timer->remap_3dto2d[0]); + PFFT_START_TIMING(ths->comm_cart, ths->timer->remap_nd[0]); + PX(execute_remap_nd)(ths->remap_nd[0], ths->in, ths->out, in, out); + PFFT_FINISH_TIMING(ths->timer->remap_nd[0]); execute_transposed(r, ths->serial_trafo, ths->global_remap, ths->timer->trafo, ths->timer->remap, ths->in, ths->out, in, out, ths->comm_cart); @@ -1086,9 +1086,9 @@ static void PX(execute_full)( execute_transposed(r, &ths->serial_trafo[r+1], &ths->global_remap[r], &ths->timer->trafo[r+1], &ths->timer->remap[r], ths->in, ths->out, in, out, ths->comm_cart); - PFFT_START_TIMING(ths->comm_cart, ths->timer->remap_3dto2d[1]); - PX(execute_remap_3dto2d)(ths->remap_3dto2d[1], ths->in, ths->out, in, out); - PFFT_FINISH_TIMING(ths->timer->remap_3dto2d[1]); + PFFT_START_TIMING(ths->comm_cart, ths->timer->remap_nd[1]); + PX(execute_remap_nd)(ths->remap_nd[1], ths->in, ths->out, in, out); + PFFT_FINISH_TIMING(ths->timer->remap_nd[1]); /* twiddle outputs in order to get inputs shifted by n/2 */ PFFT_START_TIMING(ths->comm_cart, ths->timer->otwiddle); diff --git a/kernel/Makefile.am b/kernel/Makefile.am index 307e2b5..dc9588d 100644 --- a/kernel/Makefile.am +++ b/kernel/Makefile.am @@ -25,6 +25,7 @@ libkernel_la_SOURCES = \ partrafo.c \ sertrafo.c \ procmesh.c \ + remap.c \ remap_3dto2d.c \ timer.c \ transpose.c diff --git a/kernel/ipfft.h b/kernel/ipfft.h index 6d49503..1118244 100644 --- a/kernel/ipfft.h +++ b/kernel/ipfft.h @@ -213,7 +213,7 @@ typedef struct PX(timer_s) { double whole; double *trafo; double *remap; - double remap_3dto2d[2]; + double remap_nd[2]; double itwiddle; double otwiddle; } PX(timer_s); @@ -348,8 +348,8 @@ typedef struct{ sertrafo_plan local_transp[2]; int q0; int q1; -} remap_3dto2d_plan_s; -typedef remap_3dto2d_plan_s *remap_3dto2d_plan; +} remap_nd_plan_s; +typedef remap_nd_plan_s *remap_nd_plan; /* plan for parallel trafo */ typedef struct PX(plan_s){ @@ -385,7 +385,7 @@ typedef struct PX(plan_s){ gtransp_plan *global_remap; outrafo_plan *serial_trafo; - remap_3dto2d_plan remap_3dto2d[2]; + remap_nd_plan remap_nd[2]; /* save data pointers to input and output for for complex conjugation in the cases * PFFT_FORWARD & PFFTI_TRAFO_C2R and PFFT_BACKWARD & PFFTI_TRAFO_R2C */ @@ -586,31 +586,58 @@ MPI_Comm PX(assure_cart_comm)( void PX(split_cart_procmesh)( MPI_Comm comm_cart, MPI_Comm *comms_1d); -void PX(coords_3dto2d)( - int q0, int q1, const int *coords_3d, - int *coords_2d); -void PX(split_cart_procmesh_3dto2d_p0q0)( - MPI_Comm comm_cart_3d, - MPI_Comm *comm_1d); -void PX(split_cart_procmesh_3dto2d_p1q1)( - MPI_Comm comm_cart_3d, - MPI_Comm *comm_1d); -void PX(get_procmesh_dims_2d)( - MPI_Comm comm_cart_3d, - int *p0, int *p1, int *q0, int *q1); -void PX(split_cart_procmesh_for_3dto2d_remap_q0)( - MPI_Comm comm_cart_3d, - MPI_Comm *comm_1d); -void PX(split_cart_procmesh_for_3dto2d_remap_q1)( - MPI_Comm comm_cart_3d, - MPI_Comm *comms_1d); + int PX(get_mpi_cart_coord_1d)(MPI_Comm comm_cart_1d, int *coord); int PX(get_mpi_cart_coords)(MPI_Comm comm_cart, int maxdims, int *coords); int PX(get_mpi_cart_dims)(MPI_Comm comm_cart, int maxdims, int *dims); +/* remap.c */ +int PX(needs_remap_nd)( + int rnk_n, MPI_Comm comm_cart); + +void PX(local_block_remap_nd_transposed)( + int rnk_n, const INT *n, + MPI_Comm comm_cart, int pid, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start); +int PX(local_size_remap_nd_transposed)( + int rnk_n, const INT *n, INT howmany, + MPI_Comm comm_cart, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start); +remap_nd_plan PX(plan_remap_nd_transposed)( + int rnk_n, const INT *n, INT howmany, + MPI_Comm comm_cart, R *in, R *out, + unsigned transp_flag, unsigned trafo_flag, + unsigned opt_flag, unsigned io_flag, unsigned fftw_flags); +void PX(execute_remap_nd)( + remap_nd_plan ths, R *plannedin, R *plannedout, R *in, R *out); +void PX(remap_nd_rmplan)( + remap_nd_plan ths); + +void PX(remap_nd_split_cart_procmesh)( + const INT rnk_n, + MPI_Comm comm_cart, + MPI_Comm * comms_pm); + +void PX(remap_nd_get_coords)( + const INT rnk_n, + const INT pid, + MPI_Comm comm_cart, + int *np_pm, int *coords_pm); + +void PX(remap_nd_calculate_blocks)( + const INT rnk_n, + const INT *n, MPI_Comm comm_cart, + INT *iblk + ); + /* remap_3dto2d.c */ + void PX(local_block_remap_3dto2d_transposed)( int rnk_n, const INT *n, MPI_Comm comm_cart_3d, int pid, @@ -623,27 +650,36 @@ int PX(local_size_remap_3dto2d_transposed)( unsigned transp_flag, unsigned trafo_flag, INT *local_ni, INT *local_i_start, INT *local_no, INT *local_o_start); -remap_3dto2d_plan PX(plan_remap_3dto2d_transposed)( + +remap_nd_plan PX(plan_remap_3dto2d_transposed)( + remap_nd_plan ths, int rnk_n, const INT *n, INT howmany, MPI_Comm comm_cart_3d, R *in, R *out, unsigned transp_flag, unsigned trafo_flag, unsigned opt_flag, unsigned io_flag, unsigned fftw_flags); -void PX(execute_remap_3dto2d)( - remap_3dto2d_plan ths, R *plannedin, R *plannedout, R *in, R *out); -void PX(remap_3dto2d_rmplan)( - remap_3dto2d_plan ths); + void PX(default_block_size_3dto2d)( const INT *n, int p0, int p1, int q0, int q1, INT *iblk, INT *mblk, INT *oblk); - - - - - - - - +void PX(coords_3dto2d)( + int q0, int q1, const int *coords_3d, + int *coords_2d); +void PX(split_cart_procmesh_3dto2d_p0q0)( + MPI_Comm comm_cart_3d, + MPI_Comm *comm_1d); +void PX(split_cart_procmesh_3dto2d_p1q1)( + MPI_Comm comm_cart_3d, + MPI_Comm *comm_1d); +void PX(get_procmesh_dims_2d)( + MPI_Comm comm_cart_3d, + int *p0, int *p1, int *q0, int *q1); +void PX(split_cart_procmesh_for_3dto2d_remap_q0)( + MPI_Comm comm_cart_3d, + MPI_Comm *comm_1d); +void PX(split_cart_procmesh_for_3dto2d_remap_q1)( + MPI_Comm comm_cart_3d, + MPI_Comm *comms_1d); /* timer.c */ diff --git a/kernel/partrafo.c b/kernel/partrafo.c index 47a07e9..6e2ddba 100644 --- a/kernel/partrafo.c +++ b/kernel/partrafo.c @@ -151,26 +151,26 @@ void PX(local_block_partrafo)( &lni_to, &lis_to, &lno_to, &los_to, &lni_ti, &lis_ti, &lno_ti, &los_ti); - /* overwrite input blocks if remap_3dto2d is used */ + /* overwrite input blocks if remap_nd is used */ if( ~transp_flag & PFFT_TRANSPOSED_IN ){ PX(local_block_partrafo_transposed)( rnk_n, pni_to, pno_to, iblk, mblk, rnk_pm, coords_pm, PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], lni_to, lis_to, lno_to, los_to); - PX(local_block_remap_3dto2d_transposed)( + PX(local_block_remap_nd_transposed)( rnk_n, pni_to, comm_cart, pid, PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], local_ni, local_i_start, dummy_ln, dummy_ls); } - /* overwrite input blocks if remap_3dto2d is used */ + /* overwrite input blocks if remap_nd is used */ if( ~transp_flag & PFFT_TRANSPOSED_OUT ){ PX(local_block_partrafo_transposed)( rnk_n, pni_ti, pno_ti, mblk, oblk, rnk_pm, coords_pm, PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], lni_ti, lis_ti, lno_ti, los_ti); - PX(local_block_remap_3dto2d_transposed)( + PX(local_block_remap_nd_transposed)( rnk_n, pno_ti, comm_cart, pid, PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], dummy_ln, dummy_ls, local_no, local_o_start); } @@ -264,7 +264,7 @@ INT PX(local_size_partrafo)( lni_to, lis_to, lno_to, los_to); mem = MAX(mem, mem_tmp); - mem_tmp = PX(local_size_remap_3dto2d_transposed)( + mem_tmp = PX(local_size_remap_nd_transposed)( rnk_n, pni_to, howmany, comm_cart, PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], lni_to, lis_to, dummy_ln, dummy_ls); mem = MAX(mem, mem_tmp); @@ -278,7 +278,7 @@ INT PX(local_size_partrafo)( lni_ti, lis_ti, lno_ti, los_ti); mem = MAX(mem, mem_tmp); - mem_tmp = PX(local_size_remap_3dto2d_transposed)( + mem_tmp = PX(local_size_remap_nd_transposed)( rnk_n, pno_ti, howmany, comm_cart, PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], dummy_ln, dummy_ls, lno_ti, los_ti); mem = MAX(mem, mem_tmp); @@ -458,17 +458,17 @@ PX(plan) PX(plan_partrafo)( /* plan with transposed output */ if(transp_flag & PFFT_TRANSPOSED_IN){ - ths->remap_3dto2d[0] = NULL; + ths->remap_nd[0] = NULL; set_plans_to_null(rnk_pm, PFFT_TRANSPOSED_OUT, ths->serial_trafo, ths->global_remap); } else { - ths->remap_3dto2d[0] = PX(plan_remap_3dto2d_transposed)( + ths->remap_nd[0] = PX(plan_remap_nd_transposed)( rnk_n, pni_to, howmany, comm_cart, in, out, PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], opt_flag, io_flag, fftw_flags); - /* If remap_3dto2d exists, go on with in-place transforms in order to preserve input. */ - if( (ths->remap_3dto2d[0] != NULL) && (~io_flag & PFFT_DESTROY_INPUT) ) + /* If remap_nd exists, go on with in-place transforms in order to preserve input. */ + if( (ths->remap_nd[0] != NULL) && (~io_flag & PFFT_DESTROY_INPUT) ) in = out; PX(plan_partrafo_transposed)( @@ -486,7 +486,7 @@ PX(plan) PX(plan_partrafo)( if(transp_flag & PFFT_TRANSPOSED_OUT){ set_plans_to_null(rnk_pm, PFFT_TRANSPOSED_IN, ths->serial_trafo, ths->global_remap); - ths->remap_3dto2d[1] = NULL; + ths->remap_nd[1] = NULL; } else { PX(plan_partrafo_transposed)( rnk_n, pn_ti, pni_ti, pno_ti, howmany, mblk, oblk, @@ -498,7 +498,7 @@ PX(plan) PX(plan_partrafo)( if( ~io_flag & PFFT_DESTROY_INPUT ) in = out; - ths->remap_3dto2d[1] = PX(plan_remap_3dto2d_transposed)( + ths->remap_nd[1] = PX(plan_remap_nd_transposed)( rnk_n, pno_ti, howmany, comm_cart, out, in, PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], opt_flag, io_flag, fftw_flags); @@ -535,18 +535,16 @@ static void malloc_and_split_cart_procmesh( { MPI_Cartdim_get(comm_cart, rnk_pm); - if( PX(needs_3dto2d_remap)(rnk_n, comm_cart) ) - *rnk_pm = 2; + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) + *rnk_pm = rnk_n - 1; *comms_pm = (MPI_Comm*) malloc(sizeof(MPI_Comm) * (size_t) *rnk_pm); - if( PX(needs_3dto2d_remap)(rnk_n, comm_cart) ){ - PX(split_cart_procmesh_3dto2d_p0q0)(comm_cart, - *comms_pm + 0); - PX(split_cart_procmesh_3dto2d_p1q1)(comm_cart, - *comms_pm + 1); - } else + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) { + PX(remap_nd_split_cart_procmesh)(rnk_n, comm_cart, *comms_pm); + } else { PX(split_cart_procmesh)(comm_cart, *comms_pm); + } } static void malloc_and_compute_cart_np_and_coords( @@ -557,18 +555,14 @@ static void malloc_and_compute_cart_np_and_coords( { MPI_Cartdim_get(comm_cart, rnk_pm); - if( PX(needs_3dto2d_remap)(rnk_n, comm_cart) ) - *rnk_pm = 2; + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) + *rnk_pm = rnk_n - 1; *np_pm = PX(malloc_int)(*rnk_pm); *coords_pm = PX(malloc_int)(*rnk_pm); - if( PX(needs_3dto2d_remap)(rnk_n, comm_cart) ){ - int p0, p1, q0, q1, coords_3d[3]; - PX(get_procmesh_dims_2d)(comm_cart, &p0, &p1, &q0, &q1); - MPI_Cart_coords(comm_cart, pid, 3, coords_3d); - PX(coords_3dto2d)(q0, q1, coords_3d, *coords_pm); - (*np_pm)[0] = p0*q0; (*np_pm)[1] = p1*q1; + if( PX(needs_remap_nd)(rnk_n, comm_cart) ){ + PX(remap_nd_get_coords)(rnk_n, pid, comm_cart, *coords_pm, *np_pm); } else { int *periods = PX(malloc_int)(*rnk_pm); MPI_Cart_get(comm_cart, *rnk_pm, *np_pm, periods, *coords_pm); @@ -579,8 +573,6 @@ static void malloc_and_compute_cart_np_and_coords( - - static void save_param_into_plan( int rnk_n, const INT *n, const INT *ni, const INT *no, INT howmany, const INT *iblock, const INT *mblock, const INT *oblock, @@ -930,7 +922,7 @@ void PX(rmplan)( free(ths->skip_trafos); for(int t=0; t<2; t++) - PX(remap_3dto2d_rmplan)(ths->remap_3dto2d[t]); + PX(remap_nd_rmplan)(ths->remap_nd[t]); PX(destroy_timer)(ths->timer); diff --git a/kernel/procmesh.c b/kernel/procmesh.c index bb0d2bf..14660f3 100644 --- a/kernel/procmesh.c +++ b/kernel/procmesh.c @@ -25,13 +25,6 @@ static void extract_comm_1d( int dim, MPI_Comm comm_cart, MPI_Comm *comm_1d); -static void factorize( - int q, - int *ptr_q0, int *ptr_q1); -static void factorize_equal( - int p0, int p1, int q, - int *ptr_q0, int *ptr_q1); - int PX(create_procmesh)( int rnk, MPI_Comm comm, const int *np, @@ -188,205 +181,3 @@ int PX(get_mpi_cart_dims)(MPI_Comm comm_cart, int maxdims, int *dims) return ret; } -void PX(coords_3dto2d)( - int q0, int q1, const int *coords_3d, - int *coords_2d - ) -{ - coords_2d[0] = coords_3d[0]*q0 + coords_3d[2]/q1; - coords_2d[1] = coords_3d[1]*q1 + coords_3d[2]%q1; -} - -void PX(split_cart_procmesh_3dto2d_p0q0)( - MPI_Comm comm_cart_3d, - MPI_Comm *comm_1d - ) -{ - int p0, p1, q0=0, q1=0; - int ndims, coords_3d[3]; - int dim_1d, period_1d, reorder=0; - int color, key; - MPI_Comm comm; - - if( !PX(is_cart_procmesh)(comm_cart_3d) ) - return; - - MPI_Cartdim_get(comm_cart_3d, &ndims); - if(ndims != 3) - return; - - PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); - PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); - - /* split into p1*q1 comms of size p0*q0 */ - key = coords_3d[0]*q0 + coords_3d[2]/q1; - color = coords_3d[1]*q1 + coords_3d[2]%q1; - MPI_Comm_split(comm_cart_3d, color, key, &comm); - - dim_1d = p0*q0; period_1d = 1; - MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, - comm_1d); - - MPI_Comm_free(&comm); -} - - -void PX(split_cart_procmesh_3dto2d_p1q1)( - MPI_Comm comm_cart_3d, - MPI_Comm *comm_1d - ) -{ - int p0, p1, q0=0, q1=0; - int ndims, coords_3d[3]; - int dim_1d, period_1d, reorder=0; - int color, key; - MPI_Comm comm; - - if( !PX(is_cart_procmesh)(comm_cart_3d) ) - return; - - MPI_Cartdim_get(comm_cart_3d, &ndims); - if(ndims != 3) - return; - - PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); - PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); - - /* split into p0*q0 comms of size p1*q1 */ - color = coords_3d[0]*q0 + coords_3d[2]/q1; - key = coords_3d[1]*q1 + coords_3d[2]%q1; - MPI_Comm_split(comm_cart_3d, color, key, &comm); - - dim_1d = p1*q1; period_1d = 1; - MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, - comm_1d); - - MPI_Comm_free(&comm); -} - - -/* implement the splitting to create p0*p1*q0 comms of size q1 - * and p0*p1*q1 comms of size q0 */ -void PX(split_cart_procmesh_for_3dto2d_remap_q0)( - MPI_Comm comm_cart_3d, - MPI_Comm *comm_1d - ) -{ - int p0, p1, q0=0, q1=0; - int ndims, coords_3d[3]; - int dim_1d, period_1d, reorder=0; - int color, key; - MPI_Comm comm; - - if( !PX(is_cart_procmesh)(comm_cart_3d) ) - return; - - MPI_Cartdim_get(comm_cart_3d, &ndims); - if(ndims != 3) - return; - - PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); - PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); - - /* split into p0*p1*q1 comms of size q0 */ - color = coords_3d[0]*p1*q1 + coords_3d[1]*q1 + coords_3d[2]%q1; - key = coords_3d[2]/q1; - MPI_Comm_split(comm_cart_3d, color, key, &comm); - - dim_1d = q0; period_1d = 1; - MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, - comm_1d); - - MPI_Comm_free(&comm); -} - - -void PX(split_cart_procmesh_for_3dto2d_remap_q1)( - MPI_Comm comm_cart_3d, - MPI_Comm *comm_1d - ) -{ - int p0, p1, q0=0, q1=0; - int ndims, coords_3d[3]; - int dim_1d, period_1d, reorder=0; - int color, key; - MPI_Comm comm; - - if( !PX(is_cart_procmesh)(comm_cart_3d) ) - return; - - MPI_Cartdim_get(comm_cart_3d, &ndims); - if(ndims != 3) - return; - - PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); - PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); - - /* split into p0*p1*q0 comms of size q1 */ - color = coords_3d[0]*p1*q0 + coords_3d[1]*q0 + coords_3d[2]/q1; - key = coords_3d[2]%q1; -// key = coords_3d[2]/q0; /* TODO: delete this line after several tests */ - MPI_Comm_split(comm_cart_3d, color, key, &comm); - - dim_1d = q1; period_1d = 1; - MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, - comm_1d); - - MPI_Comm_free(&comm); -} - -void PX(get_procmesh_dims_2d)( - MPI_Comm comm_cart_3d, - int *p0, int *p1, int *q0, int *q1 - ) -{ - int ndims=3, dims[3]; - - PX(get_mpi_cart_dims)(comm_cart_3d, ndims, dims); - *p0 = dims[0]; *p1 = dims[1]; -// factorize(dims[2], q0, q1); - factorize_equal(dims[0], dims[1], dims[2], q0, q1); -} - -/* factorize an integer q into q0*q1 with - * q1 <= q0 and q0-q1 -> min */ -static void factorize( - int q, - int *ptr_q0, int *ptr_q1 - ) -{ - for(int t = 1; t <= sqrt(q); t++) - if(t * (q/t) == q) - *ptr_q1 = t; - - *ptr_q0 = q / (*ptr_q1); -} - -/* factorize an integer q into q0*q1 with - * abs(p0*q0 - p1*q1) -> min */ -static void factorize_equal( - int p0, int p1, int q, - int *ptr_q0, int *ptr_q1 - ) -{ - int q0, q1; - int opt_q0 = 1; - int opt_q1 = q; - R min_err = pfft_fabs(p0 * q - p1 * 1.0); - - for(q1 = 1; q1 <= sqrt(q); q1++){ - q0 = q/q1; - if(q0*q1 == q){ - R err = pfft_fabs(p0*q0 - p1*q1); - if(err < min_err){ - min_err = err; - opt_q0 = q0; - opt_q1 = q1; - } - } - } - - *ptr_q0 = opt_q0; - *ptr_q1 = opt_q1; -} - diff --git a/kernel/remap.c b/kernel/remap.c new file mode 100644 index 0000000..53e6bb1 --- /dev/null +++ b/kernel/remap.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2011-2013 Michael Pippig + * + * This file is part of PFFT. + * + * PFFT is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * PFFT is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with PFFT. If not, see . + * + */ + + +#include "pfft.h" +#include "ipfft.h" +#include "util.h" + +/* Global infos about procmesh are only enabled in debug mode + * Otherwise we do not use any global variables. */ +#if PFFT_DEBUG_GVARS + extern MPI_Comm *gdbg_comm_cart; + extern int gdbg_rnk_pm; + extern MPI_Comm *gdbg_comms_pm; +#endif + +static remap_nd_plan remap_nd_mkplan(void); + +int PX(needs_remap_nd)( + int rnk_n, MPI_Comm comm_cart + ) +{ + int rnk_pm; + + MPI_Cartdim_get(comm_cart, &rnk_pm); + + return (rnk_n == rnk_pm); +} + +void PX(local_block_remap_nd_transposed)( + int rnk_n, const INT *n, + MPI_Comm comm_cart, int pid, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start) { + + if(rnk_n == 3) { + PX(local_block_remap_3dto2d_transposed)( + rnk_n, n, + comm_cart, pid, + transp_flag, trafo_flag, + local_ni, local_i_start, + local_no, local_o_start); + return; + } + abort(); +} + +int PX(local_size_remap_nd_transposed)( + int rnk_n, const INT *n, INT howmany, + MPI_Comm comm_cart, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start) { + if(rnk_n == 3) { + return + PX(local_size_remap_3dto2d_transposed)( + rnk_n, n, howmany, + comm_cart, + transp_flag, trafo_flag, + local_ni, local_i_start, + local_no, local_o_start); + } + abort(); +} + +remap_nd_plan PX(plan_remap_nd_transposed)( + int rnk_n, const INT *n, INT howmany, + MPI_Comm comm_cart, R *in, R *out, + unsigned transp_flag, unsigned trafo_flag, + unsigned opt_flag, unsigned io_flag, unsigned fftw_flags) +{ + if(rnk_n == 3) { + remap_nd_plan ths = remap_nd_mkplan(); + + return + PX(plan_remap_3dto2d_transposed)( + ths, + rnk_n, n, howmany, + comm_cart, in, out, + transp_flag, trafo_flag, + opt_flag, io_flag, fftw_flags); + } + abort(); +} + +void PX(execute_remap_nd)( + remap_nd_plan ths, R *plannedin, R *plannedout, R *in, R *out) +{ + if(ths==NULL) + return; + +// #if PFFT_DEBUG_GVARS +// int np, myrank; +// int np0, np1, rnk0, rnk1; +// MPI_Comm_size(*gdbg_comm_cart, &np); +// MPI_Comm_rank(*gdbg_comm_cart, &myrank); +// MPI_Comm_size(gdbg_comms_pm[0], &np0); +// MPI_Comm_size(gdbg_comms_pm[1], &np1); +// MPI_Comm_rank(gdbg_comms_pm[0], &rnk0); +// MPI_Comm_rank(gdbg_comms_pm[1], &rnk1); +// +// int dims[3], periods[3], coords[3]; +// MPI_Cart_get(*gdbg_comm_cart, 3, +// dims, periods, coords); +// +// INT local_N[3], local_N_start[3]; +// +// int p0, p1, q0, q1; +// p0 = dims[0]; p1 = dims[1]; +// q0 = np0/p0; q1 = np1/p1; +// +// int lerr, m; +// #endif + + /* execute all initialized plans */ + PX(execute_sertrafo)(ths->local_transp[0], plannedin, plannedout, in, out); + +// #if PFFT_DEBUG_GVARS +// local_N[0] = 512/p0; local_N_start[0] = 0; +// local_N[1] = 512/p1; local_N_start[1] = 0; +// local_N[2] = 512/q0/q1; local_N_start[2] = 0; +// +// if(!myrank) fprintf(stderr, "!!! Before 1st remap: check all coefficients !!!\n"); +// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n", +// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]); +// +// lerr=0; +// m=0; +// for(INT k2=local_N_start[2]; k2global_remap[0]->dbg->in[m]; +// if( (data - ind) > 1e-13){ +// if(!lerr) +// if(!myrank) +// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2); +// lerr = 1; +// } +// } +// #endif + + PX(execute_gtransp)(ths->global_remap[0], plannedin, plannedout, in, out); + +// #if PFFT_DEBUG_GVARS +// local_N[0] = 512/p0; local_N_start[0] = 0; +// local_N[1] = 512/p1/q1; local_N_start[1] = 0; +// local_N[2] = 512/q0; local_N_start[2] = 0; +// +// if(!myrank) fprintf(stderr, "!!! Before 2nd remap: check all coefficients !!!\n"); +// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n", +// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]); +// +// lerr=0; +// m=0; +// for(INT k2=local_N_start[2]; k2global_remap[1]->dbg->in[m]; +// if( (data - ind) > 1e-13){ +// if(!lerr) +// if(!myrank) +// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2); +// lerr = 1; +// } +// } +// #endif + + PX(execute_gtransp)(ths->global_remap[1], plannedin, plannedout, in, out); + +// #if PFFT_DEBUG_GVARS +// local_N[0] = 512/p0/q0; local_N_start[0] = 0; +// local_N[1] = 512/p1/q1; local_N_start[1] = 0; +// local_N[2] = 512; local_N_start[2] = 0; +// +// if(!myrank) fprintf(stderr, "!!! After 2nd remap: check all coefficients !!!\n"); +// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n", +// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]); +// +// lerr=0; +// m=0; +// for(INT k2=local_N_start[2]; k2global_remap[1]->dbg->out[m]; +// if( (data - ind) > 1e-13){ +// if(!lerr) +// if(!myrank) +// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2); +// lerr = 1; +// } +// } +// #endif + + PX(execute_sertrafo)(ths->local_transp[1], plannedin, plannedout, in, out); + +} + +static remap_nd_plan remap_nd_mkplan( + void + ) +{ + remap_nd_plan ths = (remap_nd_plan) malloc(sizeof(remap_nd_plan_s)); + + /* initialize to NULL for easy checks */ + for(int t=0; t<2; t++){ + ths->local_transp[t] = NULL; + ths->global_remap[t] = NULL; + } + + return ths; +} + + +void PX(remap_nd_rmplan)( + remap_nd_plan ths + ) +{ + /* plan was already destroyed or never initialized */ + if(ths==NULL) + return; + + for(int t=0; t<2; t++){ + PX(sertrafo_rmplan)(ths->local_transp[t]); + PX(gtransp_rmplan)(ths->global_remap[t]); + } + + /* free memory */ + free(ths); + /* ths=NULL; would be senseless, since we can not change the pointer itself */ +} + + +void PX(remap_nd_split_cart_procmesh)( + const INT rnk_n, + MPI_Comm comm_cart, + MPI_Comm * comms_pm) +{ + if(rnk_n == 3) { + PX(split_cart_procmesh_3dto2d_p0q0)(comm_cart, + comms_pm + 0); + PX(split_cart_procmesh_3dto2d_p1q1)(comm_cart, + comms_pm + 1); + return; + } + abort(); +} + +void PX(remap_nd_get_coords)( + const INT rnk_n, + const INT pid, + MPI_Comm comm_cart, + int *np_pm, int *coords_pm) +{ + if(rnk_n == 3) { + int p0, p1, q0, q1, coords_3d[3]; + PX(get_procmesh_dims_2d)(comm_cart, &p0, &p1, &q0, &q1); + MPI_Cart_coords(comm_cart, pid, 3, coords_3d); + PX(coords_3dto2d)(q0, q1, coords_3d, coords_pm); + (np_pm)[0] = p0*q0; (np_pm)[1] = p1*q1; + return; + } + abort(); +} + +void PX(remap_nd_calculate_blocks)( + const INT rnk_n, + const INT *n, MPI_Comm comm_cart, + INT *iblk + ) +{ + if(rnk_n == 3) { + int q0, q1, p0, p1; + INT mblk[3], oblk[3]; + + PX(get_procmesh_dims_2d)(comm_cart, &p0, &p1, &q0, &q1); + PX(default_block_size_3dto2d)(n, p0, p1, q0, q1, + iblk, mblk, oblk); + return; + } + abort(); +} + diff --git a/kernel/remap_3dto2d.c b/kernel/remap_3dto2d.c index c6607e4..59c524d 100644 --- a/kernel/remap_3dto2d.c +++ b/kernel/remap_3dto2d.c @@ -31,6 +31,13 @@ extern MPI_Comm *gdbg_comms_pm; #endif +static void factorize( + int q, + int *ptr_q0, int *ptr_q1); +static void factorize_equal( + int p0, int p1, int q, + int *ptr_q0, int *ptr_q1); + /* TODO: think about order of in and out for T_IN */ /* TODO: implement user blocksize */ @@ -68,10 +75,6 @@ static void get_local_start_3d_by_coords( const INT *n, const INT *blks, const int *coords, INT *local_start); -static remap_3dto2d_plan remap_3dto2d_mkplan( - void); - - void PX(local_block_remap_3dto2d_transposed)( int rnk_n, const INT *pn, MPI_Comm comm_cart_3d, int pid, @@ -216,7 +219,8 @@ int PX(local_size_remap_3dto2d_transposed)( /* ouput is written to 'in', also for outofplace */ -remap_3dto2d_plan PX(plan_remap_3dto2d_transposed)( +remap_nd_plan PX(plan_remap_3dto2d_transposed)( + remap_nd_plan ths, int rnk_n, const INT *pn, INT howmany, MPI_Comm comm_cart_3d, R *in_user, R *out_user, unsigned transp_flag, unsigned trafo_flag, @@ -230,7 +234,6 @@ remap_3dto2d_plan PX(plan_remap_3dto2d_transposed)( MPI_Comm icomms[3], mcomms[3], ocomms[3]; MPI_Comm comm_q0, comm_q1; R *in=in_user, *out=out_user; - remap_3dto2d_plan ths; /* remap only works for 3d data on 3d procmesh */ if(rnk_n != 3) @@ -240,8 +243,6 @@ remap_3dto2d_plan PX(plan_remap_3dto2d_transposed)( if(rnk_pm != 3) return NULL; - ths = remap_3dto2d_mkplan(); - /* Handle r2c input and c2r output like r2r. For complex data we use the C2C flag. */ if(trafo_flag & PFFTI_TRAFO_RDFT) trafo_flag = PFFTI_TRAFO_R2R; @@ -481,165 +482,206 @@ static void split_comms_3dto2d( PX(split_cart_procmesh_for_3dto2d_remap_q0)(comm_cart_3d, &mcomms[2]); } +void PX(coords_3dto2d)( + int q0, int q1, const int *coords_3d, + int *coords_2d + ) +{ + coords_2d[0] = coords_3d[0]*q0 + coords_3d[2]/q1; + coords_2d[1] = coords_3d[1]*q1 + coords_3d[2]%q1; +} +void PX(split_cart_procmesh_3dto2d_p0q0)( + MPI_Comm comm_cart_3d, + MPI_Comm *comm_1d + ) +{ + int p0, p1, q0=0, q1=0; + int ndims, coords_3d[3]; + int dim_1d, period_1d, reorder=0; + int color, key; + MPI_Comm comm; + if( !PX(is_cart_procmesh)(comm_cart_3d) ) + return; + MPI_Cartdim_get(comm_cart_3d, &ndims); + if(ndims != 3) + return; + PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); + PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); + /* split into p1*q1 comms of size p0*q0 */ + key = coords_3d[0]*q0 + coords_3d[2]/q1; + color = coords_3d[1]*q1 + coords_3d[2]%q1; + MPI_Comm_split(comm_cart_3d, color, key, &comm); + dim_1d = p0*q0; period_1d = 1; + MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, + comm_1d); + MPI_Comm_free(&comm); +} -void PX(execute_remap_3dto2d)( - remap_3dto2d_plan ths, R * plannedin, R * plannedout, R * in, R * out +void PX(split_cart_procmesh_3dto2d_p1q1)( + MPI_Comm comm_cart_3d, + MPI_Comm *comm_1d ) { - if(ths==NULL) + int p0, p1, q0=0, q1=0; + int ndims, coords_3d[3]; + int dim_1d, period_1d, reorder=0; + int color, key; + MPI_Comm comm; + + if( !PX(is_cart_procmesh)(comm_cart_3d) ) return; -// #if PFFT_DEBUG_GVARS -// int np, myrank; -// int np0, np1, rnk0, rnk1; -// MPI_Comm_size(*gdbg_comm_cart, &np); -// MPI_Comm_rank(*gdbg_comm_cart, &myrank); -// MPI_Comm_size(gdbg_comms_pm[0], &np0); -// MPI_Comm_size(gdbg_comms_pm[1], &np1); -// MPI_Comm_rank(gdbg_comms_pm[0], &rnk0); -// MPI_Comm_rank(gdbg_comms_pm[1], &rnk1); -// -// int dims[3], periods[3], coords[3]; -// MPI_Cart_get(*gdbg_comm_cart, 3, -// dims, periods, coords); -// -// INT local_N[3], local_N_start[3]; -// -// int p0, p1, q0, q1; -// p0 = dims[0]; p1 = dims[1]; -// q0 = np0/p0; q1 = np1/p1; -// -// int lerr, m; -// #endif - - /* execute all initialized plans */ - PX(execute_sertrafo)(ths->local_transp[0], plannedin, plannedout, in, out); - -// #if PFFT_DEBUG_GVARS -// local_N[0] = 512/p0; local_N_start[0] = 0; -// local_N[1] = 512/p1; local_N_start[1] = 0; -// local_N[2] = 512/q0/q1; local_N_start[2] = 0; -// -// if(!myrank) fprintf(stderr, "!!! Before 1st remap: check all coefficients !!!\n"); -// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n", -// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]); -// -// lerr=0; -// m=0; -// for(INT k2=local_N_start[2]; k2global_remap[0]->dbg->in[m]; -// if( (data - ind) > 1e-13){ -// if(!lerr) -// if(!myrank) -// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2); -// lerr = 1; -// } -// } -// #endif - - PX(execute_gtransp)(ths->global_remap[0], plannedin, plannedout, in, out); - -// #if PFFT_DEBUG_GVARS -// local_N[0] = 512/p0; local_N_start[0] = 0; -// local_N[1] = 512/p1/q1; local_N_start[1] = 0; -// local_N[2] = 512/q0; local_N_start[2] = 0; -// -// if(!myrank) fprintf(stderr, "!!! Before 2nd remap: check all coefficients !!!\n"); -// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n", -// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]); -// -// lerr=0; -// m=0; -// for(INT k2=local_N_start[2]; k2global_remap[1]->dbg->in[m]; -// if( (data - ind) > 1e-13){ -// if(!lerr) -// if(!myrank) -// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2); -// lerr = 1; -// } -// } -// #endif - - PX(execute_gtransp)(ths->global_remap[1], plannedin, plannedout, in, out); - -// #if PFFT_DEBUG_GVARS -// local_N[0] = 512/p0/q0; local_N_start[0] = 0; -// local_N[1] = 512/p1/q1; local_N_start[1] = 0; -// local_N[2] = 512; local_N_start[2] = 0; -// -// if(!myrank) fprintf(stderr, "!!! After 2nd remap: check all coefficients !!!\n"); -// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n", -// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]); -// -// lerr=0; -// m=0; -// for(INT k2=local_N_start[2]; k2global_remap[1]->dbg->out[m]; -// if( (data - ind) > 1e-13){ -// if(!lerr) -// if(!myrank) -// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2); -// lerr = 1; -// } -// } -// #endif - - PX(execute_sertrafo)(ths->local_transp[1], plannedin, plannedout, in, out); + MPI_Cartdim_get(comm_cart_3d, &ndims); + if(ndims != 3) + return; + + PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); + PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); + + /* split into p0*q0 comms of size p1*q1 */ + color = coords_3d[0]*q0 + coords_3d[2]/q1; + key = coords_3d[1]*q1 + coords_3d[2]%q1; + MPI_Comm_split(comm_cart_3d, color, key, &comm); + + dim_1d = p1*q1; period_1d = 1; + MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, + comm_1d); + + MPI_Comm_free(&comm); } -static remap_3dto2d_plan remap_3dto2d_mkplan( - void + +/* implement the splitting to create p0*p1*q0 comms of size q1 + * and p0*p1*q1 comms of size q0 */ +void PX(split_cart_procmesh_for_3dto2d_remap_q0)( + MPI_Comm comm_cart_3d, + MPI_Comm *comm_1d ) { - remap_3dto2d_plan ths = (remap_3dto2d_plan) malloc(sizeof(remap_3dto2d_plan_s)); + int p0, p1, q0=0, q1=0; + int ndims, coords_3d[3]; + int dim_1d, period_1d, reorder=0; + int color, key; + MPI_Comm comm; - /* initialize to NULL for easy checks */ - for(int t=0; t<2; t++){ - ths->local_transp[t] = NULL; - ths->global_remap[t] = NULL; - } - - return ths; + if( !PX(is_cart_procmesh)(comm_cart_3d) ) + return; + + MPI_Cartdim_get(comm_cart_3d, &ndims); + if(ndims != 3) + return; + + PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); + PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); + + /* split into p0*p1*q1 comms of size q0 */ + color = coords_3d[0]*p1*q1 + coords_3d[1]*q1 + coords_3d[2]%q1; + key = coords_3d[2]/q1; + MPI_Comm_split(comm_cart_3d, color, key, &comm); + + dim_1d = q0; period_1d = 1; + MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, + comm_1d); + + MPI_Comm_free(&comm); } -void PX(remap_3dto2d_rmplan)( - remap_3dto2d_plan ths +void PX(split_cart_procmesh_for_3dto2d_remap_q1)( + MPI_Comm comm_cart_3d, + MPI_Comm *comm_1d ) { - /* plan was already destroyed or never initialized */ - if(ths==NULL) + int p0, p1, q0=0, q1=0; + int ndims, coords_3d[3]; + int dim_1d, period_1d, reorder=0; + int color, key; + MPI_Comm comm; + + if( !PX(is_cart_procmesh)(comm_cart_3d) ) + return; + + MPI_Cartdim_get(comm_cart_3d, &ndims); + if(ndims != 3) return; - for(int t=0; t<2; t++){ - PX(sertrafo_rmplan)(ths->local_transp[t]); - PX(gtransp_rmplan)(ths->global_remap[t]); + PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); + PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); + + /* split into p0*p1*q0 comms of size q1 */ + color = coords_3d[0]*p1*q0 + coords_3d[1]*q0 + coords_3d[2]/q1; + key = coords_3d[2]%q1; +// key = coords_3d[2]/q0; /* TODO: delete this line after several tests */ + MPI_Comm_split(comm_cart_3d, color, key, &comm); + + dim_1d = q1; period_1d = 1; + MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, + comm_1d); + + MPI_Comm_free(&comm); +} + +void PX(get_procmesh_dims_2d)( + MPI_Comm comm_cart_3d, + int *p0, int *p1, int *q0, int *q1 + ) +{ + int ndims=3, dims[3]; + + PX(get_mpi_cart_dims)(comm_cart_3d, ndims, dims); + *p0 = dims[0]; *p1 = dims[1]; +// factorize(dims[2], q0, q1); + factorize_equal(dims[0], dims[1], dims[2], q0, q1); +} + +/* factorize an integer q into q0*q1 with + * q1 <= q0 and q0-q1 -> min */ +static void factorize( + int q, + int *ptr_q0, int *ptr_q1 + ) +{ + for(int t = 1; t <= sqrt(q); t++) + if(t * (q/t) == q) + *ptr_q1 = t; + + *ptr_q0 = q / (*ptr_q1); +} + +/* factorize an integer q into q0*q1 with + * abs(p0*q0 - p1*q1) -> min */ +static void factorize_equal( + int p0, int p1, int q, + int *ptr_q0, int *ptr_q1 + ) +{ + int q0, q1; + int opt_q0 = 1; + int opt_q1 = q; + R min_err = pfft_fabs(p0 * q - p1 * 1.0); + + for(q1 = 1; q1 <= sqrt(q); q1++){ + q0 = q/q1; + if(q0*q1 == q){ + R err = pfft_fabs(p0*q0 - p1*q1); + if(err < min_err){ + min_err = err; + opt_q0 = q0; + opt_q1 = q1; + } + } } - /* free memory */ - free(ths); - /* ths=NULL; would be senseless, since we can not change the pointer itself */ + *ptr_q0 = opt_q0; + *ptr_q1 = opt_q1; } diff --git a/kernel/timer.c b/kernel/timer.c index 0c47e2b..374076d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -232,7 +232,7 @@ PX(timer) PX(copy_timer)( for(int t=0; trnk_remap; t++) copy->remap[t] = orig->remap[t]; for(int t=0; t<2; t++) - copy->remap_3dto2d[t] = orig->remap_3dto2d[t]; + copy->remap_nd[t] = orig->remap_nd[t]; copy->itwiddle = orig->itwiddle; copy->otwiddle = orig->otwiddle; @@ -252,7 +252,7 @@ void PX(average_timer)( for(int t=0; trnk_remap; t++) ths->remap[t] /= ths->iter; for(int t=0; t<2; t++) - ths->remap_3dto2d[t] /= ths->iter; + ths->remap_nd[t] /= ths->iter; ths->itwiddle /= ths->iter; ths->otwiddle /= ths->iter; @@ -272,7 +272,7 @@ PX(timer) PX(add_timers)( for(int t=0; trnk_remap; t++) res->remap[t] += sum2->remap[t]; for(int t=0; t<2; t++) - res->remap_3dto2d[t] += sum2->remap_3dto2d[t]; + res->remap_nd[t] += sum2->remap_nd[t]; res->itwiddle += sum2->itwiddle; res->otwiddle += sum1->otwiddle; @@ -311,7 +311,7 @@ double* PX(convert_timer2vec)( for(int t=0; trnk_remap; t++) times[m++] = ths->remap[t]; for(int t=0; t<2; t++) - times[m++] = ths->remap_3dto2d[t]; + times[m++] = ths->remap_nd[t]; times[m++] = ths->itwiddle; times[m++] = ths->otwiddle; @@ -332,7 +332,7 @@ PX(timer) PX(convert_vec2timer)( for(int t=0; trnk_remap; t++) ths->remap[t] = times[m++]; for(int t=0; t<2; t++) - ths->remap_3dto2d[t] = times[m++]; + ths->remap_nd[t] = times[m++]; ths->itwiddle = times[m++]; ths->otwiddle = times[m++]; @@ -350,7 +350,7 @@ static void reset_timer( for(int t=0; trnk_remap; t++) ths->remap[t] = 0; for(int t=0; t<2; t++) - ths->remap_3dto2d[t] = 0; + ths->remap_nd[t] = 0; ths->itwiddle = 0; ths->otwiddle = 0; } @@ -379,7 +379,7 @@ static size_t length( /* +3 for rnk_pm, rnk_trafo, rnk_remap */ /* +1 for number of iterations */ /* +1 for whole trafo timer */ - /* +2 for remap_3dto2d[2] */ + /* +2 for remap_nd[2] */ /* +2 for itwiddle, otwiddle */ return (size_t) (ths->rnk_trafo + ths->rnk_remap + 9); } @@ -416,8 +416,8 @@ static void fprint_average_timer_prefixed( /* print times of transposed out step */ PX(fprintf)(comm, file, "%s_itwiddle(%d) = %.3e;\n", prefix, idx, mt->itwiddle); - PX(fprintf)(comm, file, "%s_remap_3dto2d(%d, 2) = %.3e;\n", - prefix, idx, mt->remap_3dto2d[0]); + PX(fprintf)(comm, file, "%s_remap_nd(%d, 2) = %.3e;\n", + prefix, idx, mt->remap_nd[0]); for(int t=0; trnk_pm; t++, k++, l++){ PX(fprintf)(comm, file, "%s_trafo%d(%d, 2) = %.3e; ", prefix, k+1, idx, mt->trafo[k]); @@ -438,7 +438,7 @@ static void fprint_average_timer_prefixed( PX(fprintf)(comm, file, "%s_trafo%d(%d, 2) = %.3e;\n", prefix, k+1, idx, mt->trafo[k]); PX(fprintf)(comm, file, "%s_remap_2dto3d(%d, 2) = %.3e;\n", - prefix, idx, mt->remap_3dto2d[1]); + prefix, idx, mt->remap_nd[1]); PX(fprintf)(comm, file, "%s_otwiddle(%d) = %.3e;\n", prefix, idx, mt->otwiddle); } diff --git a/util/util.c b/util/util.c index 924a16e..d0f25bd 100644 --- a/util/util.c +++ b/util/util.c @@ -248,14 +248,3 @@ unsigned* PX(malloc_unsigned)( return (unsigned*) malloc(sizeof(unsigned) * size); } -int PX(needs_3dto2d_remap)( - int rnk_n, MPI_Comm comm_cart - ) -{ - int rnk_pm; - - MPI_Cartdim_get(comm_cart, &rnk_pm); - - return (rnk_n == 3) && (rnk_pm == 3); -} - diff --git a/util/util.h b/util/util.h index 0a95d7d..4ab38eb 100644 --- a/util/util.h +++ b/util/util.h @@ -61,8 +61,5 @@ int* PX(malloc_int)( unsigned* PX(malloc_unsigned)( size_t size); -int PX(needs_3dto2d_remap)( - int rnk_n, MPI_Comm comm_cart); - #endif /* !PFFT_UTIL_H */ From 9eb305426e8a71f8035e21d4b540e1da72aceb61 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Wed, 17 Jan 2018 15:20:25 -0800 Subject: [PATCH 12/35] Add some comments in the code. --- kernel/remap_3dto2d.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/remap_3dto2d.c b/kernel/remap_3dto2d.c index 59c524d..36a826c 100644 --- a/kernel/remap_3dto2d.c +++ b/kernel/remap_3dto2d.c @@ -23,8 +23,18 @@ #include "ipfft.h" #include "util.h" +/* + * useful table of i, o, and m. + * + * i : n0 / p0 x n1 / p1 x n2 / p2 + * o : n0 / (p0 * q0) x n1 / (p1 * q1) x n2 / 1 + * m : n0 / p0 x n1 / (p1 * q1) x n2 / q0 + * + * */ + /* Global infos about procmesh are only enabled in debug mode * Otherwise we do not use any global variables. */ + #if PFFT_DEBUG_GVARS extern MPI_Comm *gdbg_comm_cart; extern int gdbg_rnk_pm; @@ -274,6 +284,15 @@ remap_nd_plan PX(plan_remap_3dto2d_transposed)( } /* n2/(q0*q1) x n0/p0 x n1/p1 -> n2/q0 x n0/p0 x n1/(p1*q1) */ + /* for each q0, we are looking at a transpose of + * local_ni[1] x (local_nm[2] x local_ni[0]), + * The intial partition is along (local_nm[2] x local_ni[0]), + * by size iblk[2] x local_ni[0]. + * The final partition is along local_ni[1], by size + * mblk[1]. + * + * The math works out by referring to the table at the beginning of the code. + * */ N0 = local_ni[1]; h0 = 1; N1 = local_nm[2]; h1 = local_ni[0]; blk0 = mblk[1]; From 309fc9677aaba11d2b40132760c719c2c7e7abc1 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Wed, 17 Jan 2018 16:40:48 -0800 Subject: [PATCH 13/35] add a 2dto1d remapping that compiles. --- kernel/Makefile.am | 1 + kernel/ipfft.h | 45 ++++ kernel/remap.c | 10 + kernel/remap_2dto1d.c | 537 ++++++++++++++++++++++++++++++++++++++++++ kernel/remap_3dto2d.c | 2 +- 5 files changed, 594 insertions(+), 1 deletion(-) create mode 100644 kernel/remap_2dto1d.c diff --git a/kernel/Makefile.am b/kernel/Makefile.am index dc9588d..265dd46 100644 --- a/kernel/Makefile.am +++ b/kernel/Makefile.am @@ -27,5 +27,6 @@ libkernel_la_SOURCES = \ procmesh.c \ remap.c \ remap_3dto2d.c \ + remap_2dto1d.c \ timer.c \ transpose.c diff --git a/kernel/ipfft.h b/kernel/ipfft.h index 1118244..c24d69c 100644 --- a/kernel/ipfft.h +++ b/kernel/ipfft.h @@ -681,6 +681,51 @@ void PX(split_cart_procmesh_for_3dto2d_remap_q1)( MPI_Comm comm_cart_3d, MPI_Comm *comms_1d); +/* remap_2dto1d.c */ + +void PX(local_block_remap_2dto1d_transposed)( + int rnk_n, const INT *n, + MPI_Comm comm_cart_3d, int pid, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start); +int PX(local_size_remap_2dto1d_transposed)( + int rnk_n, const INT *n, INT howmany, + MPI_Comm comm_cart_3d, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start); + +remap_nd_plan PX(plan_remap_2dto1d_transposed)( + remap_nd_plan ths, + int rnk_n, const INT *n, INT howmany, + MPI_Comm comm_cart_3d, R *in, R *out, + unsigned transp_flag, unsigned trafo_flag, + unsigned opt_flag, unsigned io_flag, unsigned fftw_flags); + +void PX(default_block_size_2dto1d)( + const INT *n, int p0, int q0, + INT *iblk, INT *oblk); + +void PX(coords_2dto1d)( + int q0, const int *coords_2d, + int *coords_1d); +void PX(split_cart_procmesh_2dto1d_p0q0)( + MPI_Comm comm_cart_2d, + MPI_Comm *comm_1d); +void PX(split_cart_procmesh_2dto1d_p1q1)( + MPI_Comm comm_cart_2d, + MPI_Comm *comm_1d); +void PX(get_procmesh_dims_1d)( + MPI_Comm comm_cart_2d, + int *p0, int *q0); +void PX(split_cart_procmesh_for_2dto1d_remap_q0)( + MPI_Comm comm_cart_2d, + MPI_Comm *comm_1d); +void PX(split_cart_procmesh_for_2dto1d_remap_q1)( + MPI_Comm comm_cart_2d, + MPI_Comm *comms_1d); + /* timer.c */ PX(timer) PX(mktimer)( diff --git a/kernel/remap.c b/kernel/remap.c index 53e6bb1..45d00e8 100644 --- a/kernel/remap.c +++ b/kernel/remap.c @@ -60,6 +60,15 @@ void PX(local_block_remap_nd_transposed)( local_no, local_o_start); return; } + if(rnk_n == 2) { + PX(local_block_remap_2dto1d_transposed)( + rnk_n, n, + comm_cart, pid, + transp_flag, trafo_flag, + local_ni, local_i_start, + local_no, local_o_start); + return; + } abort(); } @@ -286,6 +295,7 @@ void PX(remap_nd_get_coords)( abort(); } +/* used in API as a short cut to pin the block size; notice that only iblk is needed. */ void PX(remap_nd_calculate_blocks)( const INT rnk_n, const INT *n, MPI_Comm comm_cart, diff --git a/kernel/remap_2dto1d.c b/kernel/remap_2dto1d.c new file mode 100644 index 0000000..873f0be --- /dev/null +++ b/kernel/remap_2dto1d.c @@ -0,0 +1,537 @@ +/* + * Copyright (c) 2011-2013 Michael Pippig + * + * This file is part of PFFT. + * + * PFFT is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * PFFT is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with PFFT. If not, see . + * + */ + + +#include "pfft.h" +#include "ipfft.h" +#include "util.h" + +/* Global infos about procmesh are only enabled in debug mode + * Otherwise we do not use any global variables. */ +#if PFFT_DEBUG_GVARS + extern MPI_Comm *gdbg_comm_cart; + extern int gdbg_rnk_pm; + extern MPI_Comm *gdbg_comms_pm; +#endif + +static void factorize( + int q, + int *ptr_q0, int *ptr_q1); +static void factorize_equal( + int p0, int p1, int q, + int *ptr_q0, int *ptr_q1); + + +/* TODO: think about order of in and out for T_IN */ +/* TODO: implement user blocksize */ + +static void init_blks_comms_local_size( + const INT *n, MPI_Comm comm_cart_2d, + INT *iblk, INT *oblk, + MPI_Comm *icomms, MPI_Comm *ocomms, + INT *local_ni, INT *local_no); + +static void split_comms_2dto1d( + MPI_Comm comm_cart_2d, + MPI_Comm *icomms, MPI_Comm *ocomms); +static void get_local_blocks_by_comms( + const INT *n, + const INT *iblks, const MPI_Comm *icomms, + const INT *oblks, const MPI_Comm *ocomms, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start); +static void get_local_n_2d_by_comms( + const INT *n, const INT *blks, const MPI_Comm *comms, + INT *local_n); +static void get_local_start_2d_by_comms( + const INT *n, const INT *blks, const MPI_Comm *comms, + INT *local_start); +static void get_local_blocks_by_coords( + const INT *n, + const INT *iblks, const int *icoords, + const INT *oblks, const int *ocoords, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start); +static void get_local_n_2d_by_coords( + const INT *n, const INT *blks, const int *coords, + INT *local_n); +static void get_local_start_2d_by_coords( + const INT *n, const INT *blks, const int *coords, + INT *local_start); + +void PX(local_block_remap_2dto1d_transposed)( + int rnk_n, const INT *pn, + MPI_Comm comm_cart_2d, int pid, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start + ) +{ + int p0, q0, rnk_pm; + int icoords[2], ocoords[2]; + INT iblks[2], oblks[2]; + + /* remap only works for 3d data on 3d procmesh */ + if(rnk_n != 2) return; + + MPI_Cartdim_get(comm_cart_2d, &rnk_pm); + if(rnk_pm != 2) return; + + /* Handle r2c input and c2r output like r2r. For complex data we use the C2C flag. */ + if(trafo_flag & PFFTI_TRAFO_RDFT) + trafo_flag = PFFTI_TRAFO_R2R; + + PX(get_procmesh_dims_1d)(comm_cart_2d, &p0, &q0); + PX(default_block_size_2dto1d)(pn, p0, q0, iblks, oblks); + + MPI_Cart_coords(comm_cart_2d, pid, 2, icoords); + PX(coords_2dto1d)(q0, icoords, ocoords); + ocoords[1] = 0; + + /* take care of transposed data ordering */ + if(transp_flag & PFFT_TRANSPOSED_OUT){ + get_local_blocks_by_coords(pn, iblks, icoords, oblks, ocoords, + local_ni, local_i_start, local_no, local_o_start); + } else { + get_local_blocks_by_coords(pn, iblks, icoords, oblks, ocoords, + local_no, local_o_start, local_ni, local_i_start); + } +} + +static void free_three_comms(MPI_Comm *comms) +{ + const int num_comms = 2; + for(int t=0; t n1/p1 x n0/p0 */ + nb = local_ni[0]; + nt = local_ni[1]; + + mem_tmp = PX(local_size_sertrafo)( + nb, 1, &nt, howmany, trafo_flag); + mem = MAX(mem, mem_tmp); + + /* n1/q0 x n0/p0 -> n1 x n0/(p0*q0) + * for each q0 ranks, a transpose of a matrix n1 x n0/p0, + * from divided along n1, to along n0/p0 + * + * */ + N0 = local_ni[0]; h0 = 1; /* n0 / p0 */ + N1 = local_no[1]; h1 = 1; /* n1 */ + blk0 = oblk[1]; /* n0/(p0*q0) */ + blk1 = iblk[1]; /* n1 / q0 */ + hm = 1; /* set hm to 1 since mem will be in units of real/complex */ + + PX(split_cart_procmesh_for_2dto1d_remap_q0)(comm_cart_2d, &comm_q0); + mem_tmp = PX(local_size_global_transp)( + N0, N1, h0, h1, hm, blk0, blk1, comm_q0); + mem = MAX(mem, mem_tmp); + MPI_Comm_free(&comm_q0); + + /* n1 x n0/(p0*q0) -> n0/(p0*q0) x n1 */ + nb = local_no[1]; + nt = local_no[0]; + + mem_tmp = PX(local_size_sertrafo)( + nb, 1, &nt, howmany, trafo_flag); + mem = MAX(mem, mem_tmp); + + /* take care of transposed data ordering */ + if(transp_flag & PFFT_TRANSPOSED_OUT){ + get_local_blocks_by_comms(pn, iblk, icomms, oblk, ocomms, + local_ni, local_i_start, local_no, local_o_start); + } else { + get_local_blocks_by_comms(pn, iblk, icomms, oblk, ocomms, + local_no, local_o_start, local_ni, local_i_start); + } + + /* free communicators */ + free_three_comms(icomms); + free_three_comms(ocomms); + + return mem; +} + + +/* ouput is written to 'in', also for outofplace */ +remap_nd_plan PX(plan_remap_2dto1d_transposed)( + remap_nd_plan ths, + int rnk_n, const INT *pn, INT howmany, + MPI_Comm comm_cart_2d, R *in_user, R *out_user, + unsigned transp_flag, unsigned trafo_flag, + unsigned opt_flag, unsigned io_flag, unsigned fftw_flags + ) +{ + int p0, p1, rnk_pm; + INT nb, nt, N0, N1, h0, h1, hm, blk0, blk1; + INT local_ni[3], local_no[3]; + INT iblk[3],oblk[3]; + MPI_Comm icomms[3], ocomms[3]; + MPI_Comm comm_q0, comm_q1; + R *in=in_user, *out=out_user; + + /* remap only works for 2d data on 2d procmesh */ + if(rnk_n != 2) + return NULL; + + MPI_Cartdim_get(comm_cart_2d, &rnk_pm); + if(rnk_pm != 2) + return NULL; + + /* Handle r2c input and c2r output like r2r. For complex data we use the C2C flag. */ + if(trafo_flag & PFFTI_TRAFO_RDFT) + trafo_flag = PFFTI_TRAFO_R2R; + + PX(get_procmesh_dims_1d)(comm_cart_2d, &p0, &ths->q0); + init_blks_comms_local_size(pn, comm_cart_2d, + iblk, oblk, icomms, ocomms, + local_ni, local_no); + + /* n0/p0 x n1/p1 > n1/q0 x n0/p0 */ + nb = local_ni[0]; + nt = local_ni[1]; + + /* plan TRANSPOSED_IN in opposite direction */ + if(transp_flag & PFFT_TRANSPOSED_IN){ + if(~io_flag & PFFT_DESTROY_INPUT) + in = out; /* default: compute in-place plans on 'out' in order to preserve inputs */ + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, out, in, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } else { + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + if(~io_flag & PFFT_DESTROY_INPUT) + in = out; /* default: compute in-place plans on 'out' in order to preserve inputs */ + } + + if(transp_flag & PFFT_TRANSPOSED_IN) + ths->global_remap[1] = NULL; + else + ths->global_remap[0] = NULL; + + /* n1/q0 x n0/p0 -> n1 x n0/(p0*q0) */ + N0 = local_ni[0]; h0 = 1; /* n0 / p0 */ + N1 = local_no[1]; h1 = 1; /* n1 */ + blk0 = oblk[1]; /* n0/(p0*q0) */ + blk1 = iblk[1]; /* n1 / q0 */ + hm = howmany * (trafo_flag & PFFTI_TRAFO_C2C ? 2 : 1); + + PX(split_cart_procmesh_for_2dto1d_remap_q0)(comm_cart_2d, &comm_q0); + if(transp_flag & PFFT_TRANSPOSED_IN) + ths->global_remap[0] = PX(plan_global_transp)( + N1, N0, h1, h0, hm, blk1, blk0, + comm_q0, out, in, PFFT_TRANSPOSED_OUT, fftw_flags); + else + ths->global_remap[1] = PX(plan_global_transp)( + N0, N1, h0, h1, hm, blk0, blk1, + comm_q0, in, out, PFFT_TRANSPOSED_IN, fftw_flags); + MPI_Comm_free(&comm_q0); + + /* n1 x n0/(p0*q0) -> n0/(p0*q0) x n1 */ + nb = local_no[1]; + nt = local_no[0]; + + if(transp_flag & PFFT_TRANSPOSED_IN){ + if(~io_flag & PFFT_DESTROY_INPUT){ + /* restore pointers to 'in_user' and 'out_user' in order to compute the first step out-of-place */ + in = in_user; + } + + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } else { + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, out, in, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } + + /* free communicators */ + free_three_comms(icomms); + free_three_comms(ocomms); + + return ths; +} + +static void init_blks_comms_local_size( + const INT *n, MPI_Comm comm_cart_2d, + INT *iblk, INT *oblk, + MPI_Comm *icomms, MPI_Comm *ocomms, + INT *local_ni, INT *local_no + ) +{ + int p0, p1, q0, q1; + + PX(get_procmesh_dims_1d)(comm_cart_2d, &p0, &q0); + + PX(default_block_size_2dto1d)(n, p0, q0, + iblk, oblk); + split_comms_2dto1d(comm_cart_2d, + icomms, ocomms); + get_local_n_2d_by_comms(n, iblk, icomms, + local_ni); + get_local_n_2d_by_comms(n, oblk, ocomms, + local_no); +} + +static void get_local_blocks_by_comms( + const INT *n, + const INT *iblks, const MPI_Comm *icomms, + const INT *oblks, const MPI_Comm *ocomms, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start + ) +{ + get_local_n_2d_by_comms(n, iblks, icomms, local_ni); + get_local_start_2d_by_comms(n, iblks, icomms, local_i_start); + get_local_n_2d_by_comms(n, oblks, ocomms, local_no); + get_local_start_2d_by_comms(n, oblks, ocomms, local_o_start); +} + +static void get_local_n_2d_by_comms( + const INT *n, const INT *blks, const MPI_Comm *comms, + INT *local_n + ) +{ + int coord; + + for(int t=0; t<2; t++){ + PX(get_mpi_cart_coord_1d)(comms[t], &coord); + local_n[t] = PX(local_block_size)(n[t], blks[t], coord); + } +} + +static void get_local_start_2d_by_comms( + const INT *n, const INT *blks, const MPI_Comm *comms, + INT *local_start + ) +{ + int coord; + + for(int t=0; t<2; t++){ + PX(get_mpi_cart_coord_1d)(comms[t], &coord); + local_start[t] = PX(local_block_offset)(n[t], blks[t], coord); + } +} + +static void get_local_blocks_by_coords( + const INT *n, + const INT *iblks, const int *icoords, + const INT *oblks, const int *ocoords, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start + ) +{ + get_local_n_2d_by_coords(n, iblks, icoords, local_ni); + get_local_start_2d_by_coords(n, iblks, icoords, local_i_start); + get_local_n_2d_by_coords(n, oblks, ocoords, local_no); + get_local_start_2d_by_coords(n, oblks, ocoords, local_o_start); +} + +static void get_local_n_2d_by_coords( + const INT *n, const INT *blks, const int *coords, + INT *local_n + ) +{ + for(int t=0; t<2; t++) + local_n[t] = PX(local_block_size)(n[t], blks[t], coords[t]); +} + +static void get_local_start_2d_by_coords( + const INT *n, const INT *blks, const int *coords, + INT *local_start + ) +{ + for(int t=0; t<2; t++) + local_start[t] = PX(local_block_offset)(n[t], blks[t], coords[t]); +} + + + +void PX(default_block_size_2dto1d)( + const INT *n, int p0, int q0, + INT *iblk, INT *oblk + ) +{ + /* n0/(p0*q0) x n1 */ + oblk[0] = PX(global_block_size)(n[0], PFFT_DEFAULT_BLOCK, p0*q0); + oblk[2] = n[2]; + + /* n0/p0 x n1/q0 */ + iblk[0] = oblk[0]*q0; + iblk[1] = PX(global_block_size)(n[1], PFFT_DEFAULT_BLOCK, q0); +} + + +/* allocate array of length 3 for communicators */ +static void split_comms_2dto1d( + MPI_Comm comm_cart_2d, + MPI_Comm *icomms, MPI_Comm *ocomms + ) +{ + int ndims=1, dim_1d, period_1d, reorder=0; + + /* n0/p0 x n1/p1 */ + PX(split_cart_procmesh)(comm_cart_2d, icomms); + + /* n0/(p0 * q0) x n1 */ + PX(split_cart_procmesh_2dto1d_p0q0)(comm_cart_2d, &ocomms[0]); + dim_1d=1; period_1d=1; + MPI_Cart_create(MPI_COMM_SELF, ndims, &dim_1d, &period_1d, reorder, + &ocomms[1]); + +} + +void PX(coords_2dto1d)( + int q0, const int *coords_2d, + int *coords_1d + ) +{ + coords_1d[0] = coords_2d[0]*q0 + coords_2d[1]; +} + +void PX(split_cart_procmesh_2dto1d_p0q0)( + MPI_Comm comm_cart_2d, + MPI_Comm *comm_1d + ) +{ + int p0, q0=0; + int ndims, coords_2d[3]; + int dim_1d, period_1d, reorder=0; + int color, key; + MPI_Comm comm; + + if( !PX(is_cart_procmesh)(comm_cart_2d) ) + return; + + MPI_Cartdim_get(comm_cart_2d, &ndims); + if(ndims != 2) + return; + + PX(get_mpi_cart_coords)(comm_cart_2d, ndims, coords_2d); + PX(get_procmesh_dims_1d)(comm_cart_2d, &p0, &q0); + + /* split into 1 comms of size p0*q0 */ + key = coords_2d[0]*q0 + coords_2d[1]; + color = 0; + MPI_Comm_split(comm_cart_2d, color, key, &comm); + + dim_1d = p0*q0; period_1d = 1; + MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, + comm_1d); + + MPI_Comm_free(&comm); +} + + +/* implement the splitting to create p0 comms of size q0 */ +void PX(split_cart_procmesh_for_2dto1d_remap_q0)( + MPI_Comm comm_cart_2d, + MPI_Comm *comm_1d + ) +{ + int p0, q0=0; + int ndims, coords_2d[2]; + int dim_1d, period_1d, reorder=0; + int color, key; + MPI_Comm comm; + + if( !PX(is_cart_procmesh)(comm_cart_2d) ) + return; + + MPI_Cartdim_get(comm_cart_2d, &ndims); + if(ndims != 2) + return; + + PX(get_mpi_cart_coords)(comm_cart_2d, ndims, coords_2d); + PX(get_procmesh_dims_1d)(comm_cart_2d, &p0, &q0); + + /* split into p0 comms of size q0 */ + color = coords_2d[0]; + key = coords_2d[1]; + MPI_Comm_split(comm_cart_2d, color, key, &comm); + + dim_1d = q0; period_1d = 1; + MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, + comm_1d); + + MPI_Comm_free(&comm); +} + +void PX(get_procmesh_dims_1d)( + MPI_Comm comm_cart_2d, + int *p0, int *q0 + ) +{ + int ndims=2, dims[2]; + + PX(get_mpi_cart_dims)(comm_cart_2d, ndims, dims); + *p0 = dims[0]; + *q0 = dims[1]; +} + + + + + + diff --git a/kernel/remap_3dto2d.c b/kernel/remap_3dto2d.c index 36a826c..cb5e621 100644 --- a/kernel/remap_3dto2d.c +++ b/kernel/remap_3dto2d.c @@ -284,7 +284,7 @@ remap_nd_plan PX(plan_remap_3dto2d_transposed)( } /* n2/(q0*q1) x n0/p0 x n1/p1 -> n2/q0 x n0/p0 x n1/(p1*q1) */ - /* for each q0, we are looking at a transpose of + /* for each q1 ranks, we are looking at a transpose of * local_ni[1] x (local_nm[2] x local_ni[0]), * The intial partition is along (local_nm[2] x local_ni[0]), * by size iblk[2] x local_ni[0]. From 0e13ca1422db51095be7d9a02bf52e46d9d287c1 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Wed, 17 Jan 2018 16:44:35 -0800 Subject: [PATCH 14/35] add interface of 2dto1d in remap.c --- kernel/remap.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/kernel/remap.c b/kernel/remap.c index 45d00e8..9716b1e 100644 --- a/kernel/remap.c +++ b/kernel/remap.c @@ -87,6 +87,15 @@ int PX(local_size_remap_nd_transposed)( local_ni, local_i_start, local_no, local_o_start); } + if(rnk_n == 2) { + return + PX(local_size_remap_2dto1d_transposed)( + rnk_n, n, howmany, + comm_cart, + transp_flag, trafo_flag, + local_ni, local_i_start, + local_no, local_o_start); + } abort(); } @@ -107,6 +116,17 @@ remap_nd_plan PX(plan_remap_nd_transposed)( transp_flag, trafo_flag, opt_flag, io_flag, fftw_flags); } + if(rnk_n == 2) { + remap_nd_plan ths = remap_nd_mkplan(); + + return + PX(plan_remap_2dto1d_transposed)( + ths, + rnk_n, n, howmany, + comm_cart, in, out, + transp_flag, trafo_flag, + opt_flag, io_flag, fftw_flags); + } abort(); } @@ -168,7 +188,8 @@ void PX(execute_remap_nd)( // } // #endif - PX(execute_gtransp)(ths->global_remap[0], plannedin, plannedout, in, out); + if(ths->global_remap[0]) + PX(execute_gtransp)(ths->global_remap[0], plannedin, plannedout, in, out); // #if PFFT_DEBUG_GVARS // local_N[0] = 512/p0; local_N_start[0] = 0; @@ -196,7 +217,8 @@ void PX(execute_remap_nd)( // } // #endif - PX(execute_gtransp)(ths->global_remap[1], plannedin, plannedout, in, out); + if(ths->global_remap[1]) + PX(execute_gtransp)(ths->global_remap[1], plannedin, plannedout, in, out); // #if PFFT_DEBUG_GVARS // local_N[0] = 512/p0/q0; local_N_start[0] = 0; @@ -254,7 +276,8 @@ void PX(remap_nd_rmplan)( for(int t=0; t<2; t++){ PX(sertrafo_rmplan)(ths->local_transp[t]); - PX(gtransp_rmplan)(ths->global_remap[t]); + if (ths->global_remap[t]) + PX(gtransp_rmplan)(ths->global_remap[t]); } /* free memory */ @@ -275,6 +298,11 @@ void PX(remap_nd_split_cart_procmesh)( comms_pm + 1); return; } + if(rnk_n == 2) { + PX(split_cart_procmesh_2dto1d_p0q0)(comm_cart, + comms_pm + 0); + return; + } abort(); } @@ -292,6 +320,14 @@ void PX(remap_nd_get_coords)( (np_pm)[0] = p0*q0; (np_pm)[1] = p1*q1; return; } + if(rnk_n == 2) { + int p0, q0, coords_2d[3]; + PX(get_procmesh_dims_1d)(comm_cart, &p0, &q0); + MPI_Cart_coords(comm_cart, pid, 2, coords_2d); + PX(coords_2dto1d)(q0, coords_2d, coords_pm); + (np_pm)[0] = p0*q0; + return; + } abort(); } @@ -311,6 +347,15 @@ void PX(remap_nd_calculate_blocks)( iblk, mblk, oblk); return; } + if(rnk_n == 3) { + int q0, p0; + INT oblk[3]; + + PX(get_procmesh_dims_1d)(comm_cart, &p0, &q0); + PX(default_block_size_2dto1d)(n, p0, q0, + iblk, oblk); + return; + } abort(); } From adb637e76101862db9ca7573bce0230c89889336 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Wed, 17 Jan 2018 16:52:02 -0800 Subject: [PATCH 15/35] At least now the test case runs. But the result is garbage. --- kernel/partrafo.c | 4 ++-- kernel/remap_2dto1d.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/partrafo.c b/kernel/partrafo.c index 6e2ddba..b924d5f 100644 --- a/kernel/partrafo.c +++ b/kernel/partrafo.c @@ -367,9 +367,9 @@ PX(plan) PX(plan_partrafo)( if(rnk_n < rnk_pm) return NULL; - /* equal dimension of FFT and procmesh only implemented for 3 dimensions */ + /* equal dimension of FFT and procmesh only implemented for 3 and 2dimensions */ if(rnk_n == rnk_pm) - if(rnk_n != 3) + if(rnk_n != 3 && rnk_n != 2) return NULL; malloc_and_split_cart_procmesh(rnk_n, transp_flag, comm_cart, diff --git a/kernel/remap_2dto1d.c b/kernel/remap_2dto1d.c index 873f0be..aa625b9 100644 --- a/kernel/remap_2dto1d.c +++ b/kernel/remap_2dto1d.c @@ -415,7 +415,7 @@ void PX(default_block_size_2dto1d)( { /* n0/(p0*q0) x n1 */ oblk[0] = PX(global_block_size)(n[0], PFFT_DEFAULT_BLOCK, p0*q0); - oblk[2] = n[2]; + oblk[1] = n[1]; /* n0/p0 x n1/q0 */ iblk[0] = oblk[0]*q0; From 00c0d35ac667bf8ace27454669107bd658841426 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Wed, 17 Jan 2018 19:25:09 -0800 Subject: [PATCH 16/35] fix the 2don2d test case on 1x1. --- api/api-basic.c | 4 +- kernel/remap.c | 6 ++- kernel/remap_2dto1d.c | 85 ++++++++++++++++--------------- kernel/remap_3dto2d.c | 8 +++ tests/simple_check_r2c_2d_on_2d.c | 71 ++++++++++++++++++++++---- tests/simple_check_r2c_3d_on_3d.c | 6 +-- 6 files changed, 121 insertions(+), 59 deletions(-) diff --git a/api/api-basic.c b/api/api-basic.c index 415cb78..f2e0262 100644 --- a/api/api-basic.c +++ b/api/api-basic.c @@ -177,7 +177,6 @@ static R check_array( case PFFTI_ARRAYTYPE_HERMITIAN_COMPLEX: err = cabs( ((C*)data)[k] - 0.5 * (d1 + conj(d2))); break; } - if( err > maxerr ) maxerr = err; } @@ -1085,7 +1084,8 @@ static void PX(execute_full)( execute_transposed(r, &ths->serial_trafo[r+1], &ths->global_remap[r], &ths->timer->trafo[r+1], &ths->timer->remap[r], ths->in, ths->out, in, out, ths->comm_cart); - + + PFFT_START_TIMING(ths->comm_cart, ths->timer->remap_nd[1]); PX(execute_remap_nd)(ths->remap_nd[1], ths->in, ths->out, in, out); PFFT_FINISH_TIMING(ths->timer->remap_nd[1]); diff --git a/kernel/remap.c b/kernel/remap.c index 9716b1e..9feb2de 100644 --- a/kernel/remap.c +++ b/kernel/remap.c @@ -160,7 +160,8 @@ void PX(execute_remap_nd)( // #endif /* execute all initialized plans */ - PX(execute_sertrafo)(ths->local_transp[0], plannedin, plannedout, in, out); + if(ths->local_transp[0]) + PX(execute_sertrafo)(ths->local_transp[0], plannedin, plannedout, in, out); // #if PFFT_DEBUG_GVARS // local_N[0] = 512/p0; local_N_start[0] = 0; @@ -246,7 +247,8 @@ void PX(execute_remap_nd)( // } // #endif - PX(execute_sertrafo)(ths->local_transp[1], plannedin, plannedout, in, out); + if(ths->local_transp[1]) + PX(execute_sertrafo)(ths->local_transp[1], plannedin, plannedout, in, out); } diff --git a/kernel/remap_2dto1d.c b/kernel/remap_2dto1d.c index aa625b9..b42707e 100644 --- a/kernel/remap_2dto1d.c +++ b/kernel/remap_2dto1d.c @@ -165,6 +165,7 @@ int PX(local_size_remap_2dto1d_transposed)( nb, 1, &nt, howmany, trafo_flag); mem = MAX(mem, mem_tmp); + pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp local1 = %td\n", mem_tmp); /* n1/q0 x n0/p0 -> n1 x n0/(p0*q0) * for each q0 ranks, a transpose of a matrix n1 x n0/p0, * from divided along n1, to along n0/p0 @@ -172,16 +173,20 @@ int PX(local_size_remap_2dto1d_transposed)( * */ N0 = local_ni[0]; h0 = 1; /* n0 / p0 */ N1 = local_no[1]; h1 = 1; /* n1 */ - blk0 = oblk[1]; /* n0/(p0*q0) */ + blk0 = oblk[0]; /* n0/(p0*q0) */ blk1 = iblk[1]; /* n1 / q0 */ hm = 1; /* set hm to 1 since mem will be in units of real/complex */ + pfft_fprintf(MPI_COMM_WORLD, stderr, "sizing N0 = %td N1 = %td, blk0 = %td blk1 = %td\n", N0, N1, blk0, blk1); + PX(split_cart_procmesh_for_2dto1d_remap_q0)(comm_cart_2d, &comm_q0); mem_tmp = PX(local_size_global_transp)( N0, N1, h0, h1, hm, blk0, blk1, comm_q0); mem = MAX(mem, mem_tmp); MPI_Comm_free(&comm_q0); + pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp global = %td\n", mem_tmp); + /* n1 x n0/(p0*q0) -> n0/(p0*q0) x n1 */ nb = local_no[1]; nt = local_no[0]; @@ -190,6 +195,8 @@ int PX(local_size_remap_2dto1d_transposed)( nb, 1, &nt, howmany, trafo_flag); mem = MAX(mem, mem_tmp); + pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp local2 = %td\n", mem_tmp); + /* take care of transposed data ordering */ if(transp_flag & PFFT_TRANSPOSED_OUT){ get_local_blocks_by_comms(pn, iblk, icomms, oblk, ocomms, @@ -244,67 +251,63 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( /* n0/p0 x n1/p1 > n1/q0 x n0/p0 */ nb = local_ni[0]; nt = local_ni[1]; + + ths->local_transp[0] = NULL; + ths->local_transp[1] = NULL; + ths->global_remap[0] = NULL; + ths->global_remap[1] = NULL; /* plan TRANSPOSED_IN in opposite direction */ if(transp_flag & PFFT_TRANSPOSED_IN){ - if(~io_flag & PFFT_DESTROY_INPUT) - in = out; /* default: compute in-place plans on 'out' in order to preserve inputs */ - ths->local_transp[1] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, out, in, 0, NULL, - trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, - opt_flag, fftw_flags); } else { - ths->local_transp[0] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, in, out, 0, NULL, - trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, - opt_flag, fftw_flags); - if(~io_flag & PFFT_DESTROY_INPUT) - in = out; /* default: compute in-place plans on 'out' in order to preserve inputs */ } - if(transp_flag & PFFT_TRANSPOSED_IN) - ths->global_remap[1] = NULL; - else - ths->global_remap[0] = NULL; - - /* n1/q0 x n0/p0 -> n1 x n0/(p0*q0) */ + /* n1/q0 x n0/p0 -> n0/(p0*q0) x n1 */ N0 = local_ni[0]; h0 = 1; /* n0 / p0 */ N1 = local_no[1]; h1 = 1; /* n1 */ - blk0 = oblk[1]; /* n0/(p0*q0) */ + blk0 = oblk[0]; /* n0/(p0*q0) */ blk1 = iblk[1]; /* n1 / q0 */ hm = howmany * (trafo_flag & PFFTI_TRAFO_C2C ? 2 : 1); + pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, local_ni = %td %td\n", local_ni[0], local_ni[1]); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, local_no = %td %td\n", local_no[0], local_no[1]); PX(split_cart_procmesh_for_2dto1d_remap_q0)(comm_cart_2d, &comm_q0); - if(transp_flag & PFFT_TRANSPOSED_IN) + + if(transp_flag & PFFT_TRANSPOSED_IN) { + R * tmp; + /* compute in-place plans on 'out' in order to preserve inputs, + * global transp is preferable outof place */ + ths->global_remap[0] = PX(plan_global_transp)( N1, N0, h1, h0, hm, blk1, blk0, - comm_q0, out, in, PFFT_TRANSPOSED_OUT, fftw_flags); - else - ths->global_remap[1] = PX(plan_global_transp)( - N0, N1, h0, h1, hm, blk0, blk1, - comm_q0, in, out, PFFT_TRANSPOSED_IN, fftw_flags); - MPI_Comm_free(&comm_q0); + comm_q0, in, out, PFFT_TRANSPOSED_OUT, fftw_flags); - /* n1 x n0/(p0*q0) -> n0/(p0*q0) x n1 */ - nb = local_no[1]; - nt = local_no[0]; - - if(transp_flag & PFFT_TRANSPOSED_IN){ - if(~io_flag & PFFT_DESTROY_INPUT){ - /* restore pointers to 'in_user' and 'out_user' in order to compute the first step out-of-place */ - in = in_user; - } - - ths->local_transp[0] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, in, out, 0, NULL, + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, out, out, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags); + } else { - ths->local_transp[1] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, out, in, 0, NULL, + R * tmp; + if(io_flag & PFFT_DESTROY_INPUT) + tmp = in; + else; + /* default: compute in-place plans on 'out' in order to preserve inputs */ + tmp = out; + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, tmp, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags); + + ths->global_remap[1] = PX(plan_global_transp)( + N0, N1, h0, h1, hm, blk0, blk1, + comm_q0, tmp, out, PFFT_TRANSPOSED_IN, fftw_flags); + } + MPI_Comm_free(&comm_q0); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, N0 = %td N1 = %td, blk0 = %td blk1 = %td hm=%td transp_flag %d\n", N0, N1, blk0, blk1, hm, transp_flag ); /* free communicators */ free_three_comms(icomms); diff --git a/kernel/remap_3dto2d.c b/kernel/remap_3dto2d.c index cb5e621..013b56d 100644 --- a/kernel/remap_3dto2d.c +++ b/kernel/remap_3dto2d.c @@ -176,6 +176,8 @@ int PX(local_size_remap_3dto2d_transposed)( nb, 1, &nt, howmany, trafo_flag); mem = MAX(mem, mem_tmp); + pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp local1 = %td\n", mem_tmp); + /* n2/(q0*q1) x n0/p0 x n1/p1 -> n2/q0 x n0/p0 x n1/(p1*q1) */ N0 = local_ni[1]; h0 = 1; N1 = local_nm[2]; h1 = local_ni[0]; @@ -189,6 +191,8 @@ int PX(local_size_remap_3dto2d_transposed)( mem = MAX(mem, mem_tmp); MPI_Comm_free(&comm_q1); + pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp global1 = %td\n", mem_tmp); + /* n2/q0 x n0/p0 x n1/(p1*q1) -> n2 x n0/(p0*q0) x n1/(p1*q1) */ N0 = local_nm[0]; h0 = local_nm[1]; N1 = local_no[2]; h1 = 1; @@ -202,6 +206,8 @@ int PX(local_size_remap_3dto2d_transposed)( mem = MAX(mem, mem_tmp); MPI_Comm_free(&comm_q0); + pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp global2 = %td\n", mem_tmp); + /* n2 x n0/(p0*q0) x n1/(p1*q1) -> n0/(p0*q0) x n1/(p1*q1) x n2 */ nb = local_no[2]; nt = local_no[0] * local_no[1]; @@ -210,6 +216,8 @@ int PX(local_size_remap_3dto2d_transposed)( nb, 1, &nt, howmany, trafo_flag); mem = MAX(mem, mem_tmp); + pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp local2 = %td\n", mem_tmp); + /* take care of transposed data ordering */ if(transp_flag & PFFT_TRANSPOSED_OUT){ get_local_blocks_by_comms(pn, iblk, icomms, oblk, ocomms, diff --git a/tests/simple_check_r2c_2d_on_2d.c b/tests/simple_check_r2c_2d_on_2d.c index 3443e48..908dab7 100644 --- a/tests/simple_check_r2c_2d_on_2d.c +++ b/tests/simple_check_r2c_2d_on_2d.c @@ -1,6 +1,26 @@ #include #include +static void printarray(double * a, int n0, int n1) +{ + for(ptrdiff_t l=0; l < n0 * n1; l++) { + pfft_fprintf(MPI_COMM_WORLD, stderr, "%4.2f ", a[l]); + if ((l + 1) % n1 == 0) { + pfft_fprintf(MPI_COMM_WORLD, stderr, "\n", a[l]); + } + } +} +static void gatherarray(double * a, double * in, int local_start0, + int local_n0, int n0, int local_start1, int local_n1, int n1) +{ + memset(a, 0, sizeof(double) * n0 * n1); + for(ptrdiff_t l=0; l < local_n0 * local_n1; l++) { + int x = l / local_n1, y = l % local_n1; + a[(local_start0 + x) * n1 + local_start1 + y] = in[l]; + } + MPI_Allreduce(MPI_IN_PLACE, a, n0 * n1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); +} + int main(int argc, char **argv) { int np[2]; @@ -14,9 +34,12 @@ int main(int argc, char **argv) MPI_Comm comm_cart_2d; /* Set size of FFT and process mesh */ - n[0] = 29; n[1] = 27; - np[0] = 2; np[1] = 2; + n[0] = 4; n[1] = 4; + np[0] = 1; np[1] = 1; + double *truein = calloc(sizeof(double), n[0] * n[1]); + double *trueout = calloc(sizeof(double), n[0] * (n[1] / 2 + 1) * 2); + /* Initialize MPI and PFFT */ MPI_Init(&argc, &argv); pfft_init(); @@ -32,44 +55,70 @@ int main(int argc, char **argv) alloc_local = pfft_local_size_dft_r2c(2, n, comm_cart_2d, PFFT_TRANSPOSED_NONE, local_ni, local_i_start, local_no, local_o_start); - pfft_fprintf(MPI_COMM_WORLD, stderr, "%td.\n", alloc_local); - /* Allocate memory */ in = pfft_alloc_real(2 * alloc_local); out = pfft_alloc_complex(alloc_local); + pfft_fprintf(MPI_COMM_WORLD, stderr, "planning forward\n"); /* Plan parallel forward FFT */ plan_forw = pfft_plan_dft_r2c(2, - n, in, out, comm_cart_2d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT); + n, in, out, comm_cart_2d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_ESTIMATE| PFFT_DESTROY_INPUT); + pfft_fprintf(MPI_COMM_WORLD, stderr, "planning backward\n"); /* Plan parallel backward FFT */ plan_back = pfft_plan_dft_c2r(2, - n, out, in, comm_cart_2d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT); + n, out, in, comm_cart_2d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_ESTIMATE| PFFT_DESTROY_INPUT); /* Initialize input with random numbers */ pfft_init_input_real(2, n, local_ni, local_i_start, in); + for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1]; l++) { + in[l] = l; +// if(l == 0) +// in[l] = 1; + } + gatherarray(truein, in, local_i_start[0], local_ni[0], n[0], + local_i_start[1], local_ni[1], n[1]); + printarray(truein, n[0], n[1]); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "running forward\n"); /* execute parallel forward FFT */ pfft_execute(plan_forw); + fprintf(stderr, "%td %td %td %td\n", local_o_start[0], local_no[0], local_o_start[1], local_no[1]); + + gatherarray(trueout, out, local_o_start[0], local_no[0], n[0], + 2 * local_o_start[1], + 2 * local_no[1], 2 *(n[1] / 2 + 1)); + + printarray(trueout, n[0], (n[1] / 2 + 1) * 2); + //gatherarray(truein, out, local_i_start[0], local_ni[0], n[0], + // local_i_start[1], local_ni[1], n[1]); + //printarray(truein, n[0], n[1]); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "running backward\n"); /* clear the old input */ pfft_clear_input_real(2, n, local_ni, local_i_start, in); - /* execute parallel backward FFT */ pfft_execute(plan_back); - + /* Scale data */ for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1]; l++) in[l] /= (n[0]*n[1]); - + + gatherarray(truein, in, local_i_start[0], local_ni[0], n[0], + local_i_start[1], local_ni[1], n[1]); + printarray(truein, n[0], n[1]); + +#if 0 /* Print error of back transformed data */ MPI_Barrier(MPI_COMM_WORLD); err = pfft_check_output_real(2, n, local_ni, local_i_start, in, comm_cart_2d); - pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td, %td):\n", n[0], n[1]); + pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td):\n", n[0], n[1]); pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err); - + #endif /* free mem and finalize */ pfft_destroy_plan(plan_forw); pfft_destroy_plan(plan_back); diff --git a/tests/simple_check_r2c_3d_on_3d.c b/tests/simple_check_r2c_3d_on_3d.c index 8007f3f..f722c24 100644 --- a/tests/simple_check_r2c_3d_on_3d.c +++ b/tests/simple_check_r2c_3d_on_3d.c @@ -14,7 +14,7 @@ int main(int argc, char **argv) MPI_Comm comm_cart_3d; /* Set size of FFT and process mesh */ - n[0] = 29; n[1] = 27; n[2] = 31; + n[0] = 8; n[1] = 8; n[2] = 8; np[0] = 2; np[1] = 2; np[2] = 2; /* Initialize MPI and PFFT */ @@ -38,11 +38,11 @@ int main(int argc, char **argv) /* Plan parallel forward FFT */ plan_forw = pfft_plan_dft_r2c_3d( - n, in, out, comm_cart_3d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT); + n, in, out, comm_cart_3d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_ESTIMATE| PFFT_DESTROY_INPUT); /* Plan parallel backward FFT */ plan_back = pfft_plan_dft_c2r_3d( - n, out, in, comm_cart_3d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT); + n, out, in, comm_cart_3d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_ESTIMATE| PFFT_DESTROY_INPUT); /* Initialize input with random numbers */ pfft_init_input_real(3, n, local_ni, local_i_start, From 8f1f3d7e9ef938b7c08a02ae2dbf5b3fc31bbd0b Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Wed, 17 Jan 2018 21:28:20 -0800 Subject: [PATCH 17/35] update tests. --- tests/simple_check_r2c_2d_on_2d.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/tests/simple_check_r2c_2d_on_2d.c b/tests/simple_check_r2c_2d_on_2d.c index 908dab7..6564e19 100644 --- a/tests/simple_check_r2c_2d_on_2d.c +++ b/tests/simple_check_r2c_2d_on_2d.c @@ -4,7 +4,8 @@ static void printarray(double * a, int n0, int n1) { for(ptrdiff_t l=0; l < n0 * n1; l++) { - pfft_fprintf(MPI_COMM_WORLD, stderr, "%4.2f ", a[l]); +// pfft_fprintf(MPI_COMM_WORLD, stderr, "%04.0f, ", a[l]); + pfft_fprintf(MPI_COMM_WORLD, stderr, "%05.1f, ", a[l]); if ((l + 1) % n1 == 0) { pfft_fprintf(MPI_COMM_WORLD, stderr, "\n", a[l]); } @@ -13,10 +14,14 @@ static void printarray(double * a, int n0, int n1) static void gatherarray(double * a, double * in, int local_start0, int local_n0, int n0, int local_start1, int local_n1, int n1) { + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + memset(a, 0, sizeof(double) * n0 * n1); for(ptrdiff_t l=0; l < local_n0 * local_n1; l++) { int x = l / local_n1, y = l % local_n1; a[(local_start0 + x) * n1 + local_start1 + y] = in[l]; +// + pid * 1000; } MPI_Allreduce(MPI_IN_PLACE, a, n0 * n1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); } @@ -35,15 +40,17 @@ int main(int argc, char **argv) /* Set size of FFT and process mesh */ n[0] = 4; n[1] = 4; - np[0] = 1; np[1] = 1; + np[0] = 2; np[1] = 2; double *truein = calloc(sizeof(double), n[0] * n[1]); double *trueout = calloc(sizeof(double), n[0] * (n[1] / 2 + 1) * 2); + int pid; /* Initialize MPI and PFFT */ MPI_Init(&argc, &argv); pfft_init(); + MPI_Comm_rank(MPI_COMM_WORLD, &pid); /* Create three-dimensional process grid of size np[0] x np[1], if possible */ if( pfft_create_procmesh(2, MPI_COMM_WORLD, np, &comm_cart_2d) ){ pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]); @@ -74,9 +81,9 @@ int main(int argc, char **argv) in); for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1]; l++) { - in[l] = l; -// if(l == 0) -// in[l] = 1; + in[l] = pid * 100 + l; + // if(l == 0) + // in[l] = 1; } gatherarray(truein, in, local_i_start[0], local_ni[0], n[0], local_i_start[1], local_ni[1], n[1]); @@ -86,16 +93,19 @@ int main(int argc, char **argv) /* execute parallel forward FFT */ pfft_execute(plan_forw); - fprintf(stderr, "%td %td %td %td\n", local_o_start[0], local_no[0], local_o_start[1], local_no[1]); + pfft_fprintf(MPI_COMM_WORLD, stderr, "as real \n"); + gatherarray(truein, out, local_i_start[0], local_ni[0], n[0], + local_i_start[1], local_ni[1], n[1]); + printarray(truein, n[0], n[1]); + + + pfft_fprintf(MPI_COMM_WORLD, stderr, "as complex \n"); gatherarray(trueout, out, local_o_start[0], local_no[0], n[0], 2 * local_o_start[1], 2 * local_no[1], 2 *(n[1] / 2 + 1)); printarray(trueout, n[0], (n[1] / 2 + 1) * 2); - //gatherarray(truein, out, local_i_start[0], local_ni[0], n[0], - // local_i_start[1], local_ni[1], n[1]); - //printarray(truein, n[0], n[1]); pfft_fprintf(MPI_COMM_WORLD, stderr, "running backward\n"); /* clear the old input */ @@ -112,13 +122,12 @@ int main(int argc, char **argv) local_i_start[1], local_ni[1], n[1]); printarray(truein, n[0], n[1]); -#if 0 /* Print error of back transformed data */ MPI_Barrier(MPI_COMM_WORLD); err = pfft_check_output_real(2, n, local_ni, local_i_start, in, comm_cart_2d); pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td):\n", n[0], n[1]); pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err); - #endif + /* free mem and finalize */ pfft_destroy_plan(plan_forw); pfft_destroy_plan(plan_back); From ccda6c7f4741deee2fb46056b0aaff839d9fbd1d Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Wed, 17 Jan 2018 22:47:05 -0800 Subject: [PATCH 18/35] c2c is correct but c of r2c is wrong. however the round trip is correct. So looks like we are doing some transpose incorrectly but consistently. --- api/api-basic.c | 4 +++- kernel/remap_2dto1d.c | 28 ++++++++++------------ tests/Makefile.am | 3 ++- tests/simple_check_r2c_2d_on_2d.c | 40 ++++++++++++++++++++++++------- 4 files changed, 49 insertions(+), 26 deletions(-) diff --git a/api/api-basic.c b/api/api-basic.c index f2e0262..1cb8e8d 100644 --- a/api/api-basic.c +++ b/api/api-basic.c @@ -1085,9 +1085,10 @@ static void PX(execute_full)( execute_transposed(r, &ths->serial_trafo[r+1], &ths->global_remap[r], &ths->timer->trafo[r+1], &ths->timer->remap[r], ths->in, ths->out, in, out, ths->comm_cart); - PFFT_START_TIMING(ths->comm_cart, ths->timer->remap_nd[1]); + PX(execute_remap_nd)(ths->remap_nd[1], ths->in, ths->out, in, out); + PFFT_FINISH_TIMING(ths->timer->remap_nd[1]); /* twiddle outputs in order to get inputs shifted by n/2 */ @@ -1295,6 +1296,7 @@ static void execute_transposed( { int t; + pfft_fprintf(MPI_COMM_WORLD, stderr, "executing, rnk_pm = %d\nb", rnk_pm); for(t=0; tglobal_remap[1] = NULL; /* plan TRANSPOSED_IN in opposite direction */ - if(transp_flag & PFFT_TRANSPOSED_IN){ - } else { - } /* n1/q0 x n0/p0 -> n0/(p0*q0) x n1 */ N0 = local_ni[0]; h0 = 1; /* n0 / p0 */ @@ -279,30 +276,31 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( /* compute in-place plans on 'out' in order to preserve inputs, * global transp is preferable outof place */ - ths->global_remap[0] = PX(plan_global_transp)( - N1, N0, h1, h0, hm, blk1, blk0, - comm_q0, in, out, PFFT_TRANSPOSED_OUT, fftw_flags); + if(~io_flag & PFFT_DESTROY_INPUT) + in = out; /* default: compute in-place plans on 'out' in order to preserve inputs */ ths->local_transp[1] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, out, out, 0, NULL, + nb, 1, &nt, howmany, out, in, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags); + ths->global_remap[0] = PX(plan_global_transp)( + N1, N0, h1, h0, hm, blk1, blk0, + comm_q0, in, out, PFFT_TRANSPOSED_OUT, fftw_flags); + + } else { - R * tmp; - if(io_flag & PFFT_DESTROY_INPUT) - tmp = in; - else; - /* default: compute in-place plans on 'out' in order to preserve inputs */ - tmp = out; ths->local_transp[0] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, in, tmp, 0, NULL, + nb, 1, &nt, howmany, in, out, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags); + if(~io_flag & PFFT_DESTROY_INPUT) + in = out; /* default: compute in-place plans on 'out' in order to preserve inputs */ + ths->global_remap[1] = PX(plan_global_transp)( N0, N1, h0, h1, hm, blk0, blk1, - comm_q0, tmp, out, PFFT_TRANSPOSED_IN, fftw_flags); + comm_q0, out, in, PFFT_TRANSPOSED_IN, fftw_flags); } MPI_Comm_free(&comm_q0); diff --git a/tests/Makefile.am b/tests/Makefile.am index 6482188..2689a7c 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -107,7 +107,8 @@ check_PROGRAMS += \ simple_check_r2c_3d_on_3d_transposed_many check_PROGRAMS += \ - simple_check_r2c_2d_on_2d + simple_check_r2c_2d_on_2d \ + simple_check_c2c_2d_on_2d check_PROGRAMS += \ simple_check_ousam_c2c simple_check_ousam_c2c_transposed \ diff --git a/tests/simple_check_r2c_2d_on_2d.c b/tests/simple_check_r2c_2d_on_2d.c index 6564e19..65af409 100644 --- a/tests/simple_check_r2c_2d_on_2d.c +++ b/tests/simple_check_r2c_2d_on_2d.c @@ -5,9 +5,10 @@ static void printarray(double * a, int n0, int n1) { for(ptrdiff_t l=0; l < n0 * n1; l++) { // pfft_fprintf(MPI_COMM_WORLD, stderr, "%04.0f, ", a[l]); - pfft_fprintf(MPI_COMM_WORLD, stderr, "%05.1f, ", a[l]); +// pfft_fprintf(MPI_COMM_WORLD, stderr, "%05.1f, ", a[l]); + fprintf(stderr, "%g, ", a[l]); if ((l + 1) % n1 == 0) { - pfft_fprintf(MPI_COMM_WORLD, stderr, "\n", a[l]); + fprintf(stderr, "\n", a[l]); } } } @@ -39,8 +40,8 @@ int main(int argc, char **argv) MPI_Comm comm_cart_2d; /* Set size of FFT and process mesh */ - n[0] = 4; n[1] = 4; - np[0] = 2; np[1] = 2; + n[0] = 4; n[1] = 2; + np[0] = 1; np[1] = 2; double *truein = calloc(sizeof(double), n[0] * n[1]); double *trueout = calloc(sizeof(double), n[0] * (n[1] / 2 + 1) * 2); @@ -81,13 +82,14 @@ int main(int argc, char **argv) in); for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1]; l++) { - in[l] = pid * 100 + l; - // if(l == 0) + // in[l] = 0; + // if(l == 1) // in[l] = 1; } gatherarray(truein, in, local_i_start[0], local_ni[0], n[0], local_i_start[1], local_ni[1], n[1]); - printarray(truein, n[0], n[1]); + if(pid == 0) + printarray(truein, n[0], n[1]); pfft_fprintf(MPI_COMM_WORLD, stderr, "running forward\n"); /* execute parallel forward FFT */ @@ -97,15 +99,34 @@ int main(int argc, char **argv) pfft_fprintf(MPI_COMM_WORLD, stderr, "as real \n"); gatherarray(truein, out, local_i_start[0], local_ni[0], n[0], local_i_start[1], local_ni[1], n[1]); + if(pid == 0) printarray(truein, n[0], n[1]); pfft_fprintf(MPI_COMM_WORLD, stderr, "as complex \n"); + + int kk = 0; + for(kk = 0; kk < n[1] * n[0]; kk++) { + MPI_Barrier(MPI_COMM_WORLD); + if(kk == pid) { + fprintf(stderr, "pid = %d, %td %td %td %td\n", pid, + local_o_start[0], local_o_start[1], + local_no[0], local_no[1]); + + printarray(out, local_no[0], local_no[1] * 2); + } + } + + MPI_Barrier(MPI_COMM_WORLD); gatherarray(trueout, out, local_o_start[0], local_no[0], n[0], 2 * local_o_start[1], 2 * local_no[1], 2 *(n[1] / 2 + 1)); - printarray(trueout, n[0], (n[1] / 2 + 1) * 2); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "global result\n"); + + if(pid == 0) + printarray(trueout, n[0], (n[1] / 2 + 1) * 2); pfft_fprintf(MPI_COMM_WORLD, stderr, "running backward\n"); /* clear the old input */ @@ -120,7 +141,8 @@ int main(int argc, char **argv) gatherarray(truein, in, local_i_start[0], local_ni[0], n[0], local_i_start[1], local_ni[1], n[1]); - printarray(truein, n[0], n[1]); + if(pid == 0) + printarray(truein, n[0], n[1]); /* Print error of back transformed data */ MPI_Barrier(MPI_COMM_WORLD); From 516f202b3b3cdc47eb9ab8145de4c6cd609061f3 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Wed, 17 Jan 2018 23:00:25 -0800 Subject: [PATCH 19/35] Actually c2c also doesn't work on 4x4 with 1x2. --- api/api-basic.c | 1 + tests/simple_check_r2c_2d_on_2d.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/api/api-basic.c b/api/api-basic.c index 1cb8e8d..3f2977b 100644 --- a/api/api-basic.c +++ b/api/api-basic.c @@ -1079,6 +1079,7 @@ static void PX(execute_full)( PX(execute_remap_nd)(ths->remap_nd[0], ths->in, ths->out, in, out); PFFT_FINISH_TIMING(ths->timer->remap_nd[0]); + execute_transposed(r, ths->serial_trafo, ths->global_remap, ths->timer->trafo, ths->timer->remap, ths->in, ths->out, in, out, ths->comm_cart); diff --git a/tests/simple_check_r2c_2d_on_2d.c b/tests/simple_check_r2c_2d_on_2d.c index 65af409..ccff241 100644 --- a/tests/simple_check_r2c_2d_on_2d.c +++ b/tests/simple_check_r2c_2d_on_2d.c @@ -40,7 +40,7 @@ int main(int argc, char **argv) MPI_Comm comm_cart_2d; /* Set size of FFT and process mesh */ - n[0] = 4; n[1] = 2; + n[0] = 4; n[1] = 4; np[0] = 1; np[1] = 2; double *truein = calloc(sizeof(double), n[0] * n[1]); @@ -82,7 +82,7 @@ int main(int argc, char **argv) in); for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1]; l++) { - // in[l] = 0; + // in[l] = pid * 10 + l; // if(l == 1) // in[l] = 1; } From e84ff0b6965ec4123a416a2fdfc781ac794c7cef Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Thu, 18 Jan 2018 09:18:07 -0800 Subject: [PATCH 20/35] add gather and print helpers. --- api/pfft.h | 15 ++++++++++-- util/util.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/api/pfft.h b/api/pfft.h index 856c2fe..d36841e 100644 --- a/api/pfft.h +++ b/api/pfft.h @@ -499,8 +499,19 @@ BEGIN_C_DECLS PFFT_EXTERN PX(gctimer) PX(convert_vec2gctimer)( \ const double *times); \ PFFT_EXTERN void PX(destroy_gctimer)( \ - PX(gctimer) ths); - + PX(gctimer) ths); \ + \ + PFFT_EXTERN R * PX(gather_array)(const int rnk_n, const int howmany, \ + const R * local, \ + const INT* local_start, \ + const INT * local_n, \ + const INT * global_n, \ + MPI_Comm comm); \ + void PX(print_array)(int rnk_n, \ + int howmany, \ + R * a, \ + const INT * ni, \ + MPI_Comm comm); \ #define PFFT_MANGLE_DOUBLE(name) PFFT_CONCAT(pfft_, name) diff --git a/util/util.c b/util/util.c index d0f25bd..17fcabc 100644 --- a/util/util.c +++ b/util/util.c @@ -248,3 +248,69 @@ unsigned* PX(malloc_unsigned)( return (unsigned*) malloc(sizeof(unsigned) * size); } +R * PX(gather_array)(const int rnk_n, const int howmany, + const R * local, + const INT* local_start, + const INT * local_n, + const INT * global_n, + MPI_Comm comm) +{ + INT * global_strides = malloc(sizeof(INT) * rnk_n); + INT * offset = malloc(sizeof(INT) * rnk_n); + INT * local_strides = malloc(sizeof(INT) * rnk_n); + + size_t gn = 1, ln = 1; + int i; + for(i = rnk_n - 1; i >=0 ; i --) { + global_strides[i] = gn; + local_strides[i] = ln; + gn *= global_n[i]; + ln *= local_n[i]; + } + R * r = PX(malloc)(sizeof(R) * gn * howmany); + memset(r, 0, (sizeof(R) * gn * howmany)); + for(i = 0; i < ln; i ++) { + INT gi = 0; + INT li = i; + int d; + for(d = 0; d < rnk_n ; d++) { + offset[d] = 0; + offset[d] = (li / local_strides[d]) % local_n[d]; + gi += (offset[d] + local_start[d]) * global_strides[d]; + } + + for(d = 0; d < howmany; d ++) { + r[gi * howmany + d] = local[li * howmany + d]; + } + // printf("%td -> %td (%td) %g\n", li, gi, gn, local[li * howmany]); + } + MPI_Allreduce(MPI_IN_PLACE, r, gn * howmany, PFFT_MPI_REAL_TYPE, MPI_SUM, comm); + free(local_strides); + free(offset); + return r; +} + +void PX(print_array)(int rnk_n, + int howmany, + R * a, + const INT * ni, + MPI_Comm comm) +{ + int i; + size_t gn = 1; + for(i = 0; i < rnk_n; i ++) { + gn *= ni[i]; + } + size_t nl = gn / ni[0]; /* line break after first dim */ + + for(ptrdiff_t l=0; l < gn; l++) { + int d; + for(d = 0; d < howmany; d ++) { + pfft_fprintf(comm, stderr, "%g, ", a[l * howmany + d]); + } + if ((l + 1) % nl == 0) { + pfft_fprintf(MPI_COMM_WORLD, stderr, "\n"); + } + } +} + From 94f9ae3c6749fa2f04fb44e7eaca9c3dd803c3f8 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Thu, 18 Jan 2018 09:29:33 -0800 Subject: [PATCH 21/35] rename members of fftwplan for better understanding. --- kernel/ipfft.h | 7 ++-- kernel/sertrafo.c | 91 +++++++++++++++++++++++++--------------------- kernel/transpose.c | 24 ++++++------ 3 files changed, 65 insertions(+), 57 deletions(-) diff --git a/kernel/ipfft.h b/kernel/ipfft.h index c24d69c..a2ff314 100644 --- a/kernel/ipfft.h +++ b/kernel/ipfft.h @@ -252,11 +252,12 @@ typedef sertrafo_dbg_s *sertrafo_dbg; /* a fftw plan */ typedef void (* PX(fftw_execute))(const X(plan) , R * in, R * out); typedef struct { - X(plan) plan; - R *planned_in; /* in and out array used at the time of plannning */ - R *planned_out; + X(plan) fftw; + R *in; /* in and out array used at the time of plannning */ + R *out; PX(fftw_execute) execute; } PX(fftw_plan); + /* this function is put in sertrafo.c, it should be in a different * file; as it is also used in transpose.c */ void PX(execute_fftw_plan)( diff --git a/kernel/sertrafo.c b/kernel/sertrafo.c index 4dadde8..1f7b616 100644 --- a/kernel/sertrafo.c +++ b/kernel/sertrafo.c @@ -313,7 +313,7 @@ static sertrafo_plan plan_remap_only( plan_remap(&ths->plan[0], nb, rnk, n, howmany, in, out, trafo_flag, transp_flag, fftw_flags, PFFTI_ARRAY_INPUT); #endif - ths->plan[1].plan = NULL; + ths->plan[1].fftw = NULL; return ths; } @@ -443,7 +443,7 @@ static sertrafo_plan plan_sertrafo_pt( /* only measure times if we need them for comparison to plan1 */ if(opt_flag & PFFT_TUNE) - time0 = measure_time(ths0->plan[0].plan) + measure_time(ths0->plan[1].plan); + time0 = measure_time(ths0->plan[0].fftw) + measure_time(ths0->plan[1].fftw); /* this plan differs for altered strides or out-of-place */ ths1 = sertrafo_mkplan(); @@ -470,7 +470,7 @@ static sertrafo_plan plan_sertrafo_pt( nb, rnk, n, howmany, in1, out1, sign, kinds, trafo_flag, transp_flag1, fftw_flags PFFT_DEBUG_SERTRAFO_PTR11); } - time1 = measure_time(ths1->plan[0].plan) + measure_time(ths1->plan[1].plan); + time1 = measure_time(ths1->plan[0].fftw) + measure_time(ths1->plan[1].fftw); } } @@ -501,12 +501,12 @@ static void plan_trafo( /* R2C can not combine trafo and transposition */ if( (trafo_flag & PFFTI_TRAFO_RDFT) && needs_transpose(transp_flag) ) { - plan->plan = NULL; + plan->fftw = NULL; return; } /* R2R can not combine trafo and transposition */ if( (trafo_flag & PFFTI_TRAFO_R2R) && needs_transpose(transp_flag) ) { - plan->plan = NULL; + plan->fftw = NULL; return; } malloc_and_fill_dims_trafo( @@ -515,32 +515,32 @@ static void plan_trafo( /* choose appropriate fftw planner for trafo */ if(trafo_flag & PFFTI_TRAFO_R2C){ - plan->plan = X(plan_guru64_dft_r2c)( + plan->fftw = X(plan_guru64_dft_r2c)( dims_rnk, dims, howmany_rnk, howmany_dims, in, (C*) out, fftw_flags); - plan->planned_in = in; - plan->planned_out = out; + plan->in = in; + plan->out = out; plan->execute = (PX(fftw_execute)) X(execute_dft_r2c); } else if(trafo_flag & PFFTI_TRAFO_C2R){ - plan->plan = X(plan_guru64_dft_c2r)( + plan->fftw = X(plan_guru64_dft_c2r)( dims_rnk, dims, howmany_rnk, howmany_dims, (C*) in, out, fftw_flags); - plan->planned_in = in; - plan->planned_out = out; + plan->in = in; + plan->out = out; plan->execute = (PX(fftw_execute)) X(execute_dft_c2r); } else if(trafo_flag & PFFTI_TRAFO_R2R){ - plan->plan = X(plan_guru64_r2r)( + plan->fftw = X(plan_guru64_r2r)( dims_rnk, dims, howmany_rnk, howmany_dims, in, out, kinds, fftw_flags); - plan->planned_in = in; - plan->planned_out = out; + plan->in = in; + plan->out = out; plan->execute = (PX(fftw_execute)) X(execute_r2r); } else { - plan->plan = X(plan_guru64_dft)( + plan->fftw = X(plan_guru64_dft)( dims_rnk, dims, howmany_rnk, howmany_dims, (C*) in, (C*) out, sign, fftw_flags); - plan->planned_in = in; - plan->planned_out = out; + plan->in = in; + plan->out = out; plan->execute = (PX(fftw_execute)) X(execute_dft); } @@ -622,18 +622,18 @@ static void plan_remap( /* choose appropriate fftw planner for remap */ if(trafo_flag & PFFTI_TRAFO_C2C) { - plan->plan = X(plan_guru64_dft)( + plan->fftw = X(plan_guru64_dft)( dims_rnk, dims, howmany_rnk, howmany_dims, (C*) in, (C*) out, sign, fftw_flags); - plan->planned_in = in; - plan->planned_out = out; + plan->in = in; + plan->out = out; plan->execute = (PX(fftw_execute)) X(execute_dft); } else { - plan->plan = X(plan_guru64_r2r)( + plan->fftw = X(plan_guru64_r2r)( dims_rnk, dims, howmany_rnk, howmany_dims, in, out, kinds, fftw_flags); - plan->planned_in = in; - plan->planned_out = out; + plan->in = in; + plan->out = out; plan->execute = (PX(fftw_execute)) X(execute_r2r); } #if PFFT_DEBUG_SERTRAFO @@ -857,7 +857,7 @@ static sertrafo_plan sertrafo_mkplan( /* initialize to NULL for easy checks */ for(int t=0; t<2; t++) - ths->plan[t].plan = NULL; + ths->plan[t].fftw= NULL; /* initialize debug info */ #if PFFT_DEBUG_SERTRAFO @@ -878,8 +878,8 @@ void PX(sertrafo_rmplan)( /* take care of unsuccessful FFTW planing */ for(int t=0; t<2; t++) - if(ths->plan[t].plan != NULL) - X(destroy_plan)(ths->plan[t].plan); + if(ths->plan[t].fftw != NULL) + X(destroy_plan)(ths->plan[t].fftw); #if PFFT_DEBUG_SERTRAFO for(int t=0; t<2; t++) @@ -1064,10 +1064,17 @@ void PX(execute_fftw_plan)( R *executed_in, R *executed_out) { - R *in = (fftwplan->planned_in == planned_in) ? executed_in : executed_out; - R *out = (fftwplan->planned_out == planned_out) ? executed_out : executed_in; + R *in = NULL; // = (fftwplan->planned_in == planned_in) ? executed_in : executed_out; + R *out = NULL; //= (fftwplan->planned_out == planned_out) ? executed_out : executed_in; - (fftwplan->execute)(fftwplan->plan, in, out); + if(fftwplan->in == planned_in) in = executed_in; + if(fftwplan->in == planned_out) in = executed_out; + if(fftwplan->out == planned_in) out = executed_in; + if(fftwplan->out == planned_out) out = executed_out; + + if(in == NULL) abort(); + if(out == NULL) abort(); + (fftwplan->execute)(fftwplan->fftw, in, out); } void PX(execute_sertrafo)( @@ -1090,7 +1097,7 @@ void PX(execute_sertrafo)( if(!myrank) fprintf(stderr, "\n"); if(!myrank){ - if(ths->plan[0].plan != NULL){ + if(ths->plan[0].fftw != NULL){ fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, plan0\n", counter); print_dbg(ths->dbg[0]); } else @@ -1102,11 +1109,11 @@ void PX(execute_sertrafo)( /* Checksum inputs */ lsum=0.0; - if(ths->plan[0].plan != NULL) + if(ths->plan[0].fftw != NULL) for(INT k=0; kdbg[0]->in[k]); MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD); - if(ths->plan[0].plan != NULL) + if(ths->plan[0].fftw != NULL) if(!myrank) fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, Checksum(in0) = %e\n", counter, gsum); // #if PFFT_DEBUG_GVARS @@ -1148,16 +1155,16 @@ void PX(execute_sertrafo)( // #endif /* Serial trafo */ - if(ths->plan[0].plan != NULL) { + if(ths->plan[0].fftw != NULL) { PX(execute_fftw_plan)(&ths->plan[0], planned_in, planned_out, executed_in, executed_out); } /* Checksum outputs */ lsum=0.0; - if(ths->plan[0].plan != NULL) + if(ths->plan[0].fftw != NULL) for(INT k=0; kdbg[0]->out[k]); MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD); - if(ths->plan[0].plan != NULL) + if(ths->plan[0].fftw != NULL) if(!myrank) fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, Checksum(out0) = %e - Value may change\n", counter, gsum); // #if PFFT_DEBUG_GVARS @@ -1202,7 +1209,7 @@ void PX(execute_sertrafo)( if(!myrank) fprintf(stderr, "\n"); if(!myrank){ - if(ths->plan[1].plan != NULL){ + if(ths->plan[1].fftw != NULL){ fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, plan1\n", counter); print_dbg(ths->dbg[1]); } else @@ -1213,24 +1220,24 @@ void PX(execute_sertrafo)( /* Checksum inputs */ lsum=0.0; - if(ths->plan[1].plan != NULL) + if(ths->plan[1].fftw != NULL) for(INT k=0; kdbg[1]->in[k]); MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD); - if(ths->plan[1].plan != NULL) + if(ths->plan[1].fftw != NULL) if(!myrank) fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, Checksum(in1) = %e - Value may change.\n", counter, gsum); /* Serial trafo */ - if(ths->plan[1].plan != NULL) { + if(ths->plan[1].fftw != NULL) { PX(execute_fftw_plan)(&ths->plan[1], planned_in, planned_out, executed_in, executed_out); } /* Checksum outputs */ lsum=0.0; - if(ths->plan[1].plan != NULL) + if(ths->plan[1].fftw != NULL) for(INT k=0; kdbg[1]->out[k]); MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD); - if(ths->plan[1].plan != NULL) + if(ths->plan[1].fftw != NULL) if(!myrank) fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, Checksum(out1) = %e\n", counter, gsum); @@ -1238,7 +1245,7 @@ void PX(execute_sertrafo)( #else /* execute all initialized serfft plans */ for(int t=0; t<2; t++) { - if(ths->plan[t].plan != NULL) + if(ths->plan[t].fftw != NULL) PX(execute_fftw_plan)(&ths->plan[t], planned_in, planned_out, executed_in, executed_out); } #endif diff --git a/kernel/transpose.c b/kernel/transpose.c index 72931f1..b9d3cf8 100644 --- a/kernel/transpose.c +++ b/kernel/transpose.c @@ -171,11 +171,11 @@ gtransp_plan PX(plan_global_transp)( ths->dbg = gtransp_mkdbg(N, hm, blk, in, out, comm, fftw_flags); #endif - ths->plan.plan = XM(plan_many_transpose)( + ths->plan.fftw = XM(plan_many_transpose)( N[0], N[1], hm, blk[0], blk[1], in, out, comm, fftw_flags); - ths->plan.planned_in = in; - ths->plan.planned_out = out; + ths->plan.in = in; + ths->plan.out = out; ths->plan.execute = (PX(fftw_execute))(XM(execute_r2r)); return ths; } @@ -188,7 +188,7 @@ static gtransp_plan gtransp_mkplan( gtransp_plan ths = (gtransp_plan) malloc(sizeof(gtransp_plan_s)); /* initialize to NULL for easy checks */ - ths->plan.plan=NULL; + ths->plan.fftw = NULL; /* initialize debug info */ #if PFFT_DEBUG_GTRANSP @@ -207,8 +207,8 @@ void PX(gtransp_rmplan)( return; /* take care of unsuccessful FFTW planing */ - if(ths->plan.plan != NULL) - X(destroy_plan)(ths->plan.plan); + if(ths->plan.fftw != NULL) + X(destroy_plan)(ths->plan.fftw); #if PFFT_DEBUG_GTRANSP if(ths->dbg != NULL) @@ -237,7 +237,7 @@ void PX(execute_gtransp)( if(!myrank) fprintf(stderr, "\n"); if(!myrank){ if(ths != NULL){ - if(ths->plan.plan != NULL){ + if(ths->plan.fftw != NULL){ fprintf(stderr, "PFFT_DBG_GTRANSP: counter = %d\n", counter); print_dbg(ths->dbg); } else @@ -250,18 +250,18 @@ void PX(execute_gtransp)( /* Checksum inputs */ lsum=0.0; if(ths != NULL) - if(ths->plan.plan != NULL) + if(ths->plan.fftw != NULL) for(INT k=0; kdbg->in[k]); MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD); if(ths != NULL) - if(ths->plan.plan != NULL) + if(ths->plan.fftw != NULL) if(!myrank) fprintf(stderr, "PFFT_DBG_GTRANSP: counter = %d, Checksum(in) = %e\n", counter, gsum); #endif /* Global transposition */ if(ths != NULL) - if(ths->plan.plan != NULL) { + if(ths->plan.fftw != NULL) { PX(execute_fftw_plan)(&ths->plan, planned_in, planned_out, executed_in, executed_out); } @@ -271,12 +271,12 @@ void PX(execute_gtransp)( /* Checksum outputs */ lsum=0.0; if(ths != NULL) - if(ths->plan.plan != NULL) + if(ths->plan.fftw != NULL) for(INT k=0; kdbg->out[k]); MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD); if(ths != NULL) - if(ths->plan.plan != NULL) + if(ths->plan.fftw != NULL) if(!myrank) fprintf(stderr, "PFFT_DBG_GTRANSP: counter = %d, Checksum(out) = %e\n", counter, gsum); // if(counter==3){ From 29153504d1724eb9bcec47549935b9fec905a291 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Thu, 18 Jan 2018 10:08:21 -0800 Subject: [PATCH 22/35] commit c2c 2don2d test. --- tests/simple_check_c2c_2d_on_2d.c | 107 ++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 tests/simple_check_c2c_2d_on_2d.c diff --git a/tests/simple_check_c2c_2d_on_2d.c b/tests/simple_check_c2c_2d_on_2d.c new file mode 100644 index 0000000..bbebaeb --- /dev/null +++ b/tests/simple_check_c2c_2d_on_2d.c @@ -0,0 +1,107 @@ +#include +#include + +int main(int argc, char **argv) +{ + int np[2]; + ptrdiff_t n[2]; + ptrdiff_t alloc_local; + ptrdiff_t local_ni[2], local_i_start[2]; + ptrdiff_t local_no[2], local_o_start[2]; + ptrdiff_t global_ni[2]; + ptrdiff_t global_no[2]; + + double err; + pfft_complex *in, *out; + pfft_plan plan_forw=NULL, plan_back=NULL; + MPI_Comm comm_cart_2d; + + /* Set size of FFT and process mesh */ + n[0] = 4; n[1] = 4; + np[0] = 2; np[1] = 1; + global_no[0] = n[0]; + global_no[1] = n[1]; + global_ni[0] = n[0]; + global_ni[1] = n[1]; + + double *truein; + double *trueout; + int pid; + + /* Initialize MPI and PFFT */ + MPI_Init(&argc, &argv); + pfft_init(); + + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + /* Create three-dimensional process grid of size np[0] x np[1], if possible */ + if( pfft_create_procmesh(2, MPI_COMM_WORLD, np, &comm_cart_2d) ){ + pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]); + MPI_Finalize(); + return 1; + } + + /* Get parameters of data distribution */ + alloc_local = pfft_local_size_dft(2, n, comm_cart_2d, PFFT_TRANSPOSED_NONE, + local_ni, local_i_start, local_no, local_o_start); + + /* Allocate memory */ + in = pfft_alloc_complex(alloc_local); + out = pfft_alloc_complex(alloc_local); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "planning forward\n"); + /* Plan parallel forward FFT */ + plan_forw = pfft_plan_dft(2, + n, in, out, comm_cart_2d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_ESTIMATE| PFFT_DESTROY_INPUT); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "planning backward\n"); + /* Plan parallel backward FFT */ + plan_back = pfft_plan_dft(2, + n, out, in, comm_cart_2d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_ESTIMATE| PFFT_DESTROY_INPUT); + + /* Initialize input with random numbers */ + pfft_init_input_complex(2, n, local_ni, local_i_start, + in); + + truein = pfft_gather_array(2, 2, (double*) in, local_i_start, local_ni, global_ni, MPI_COMM_WORLD); + pfft_print_array(2, 2, truein, global_ni, MPI_COMM_WORLD); + pfft_free(truein); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "running forward\n"); + /* execute parallel forward FFT */ + pfft_execute(plan_forw); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "as complex \n"); + + trueout = pfft_gather_array(2, 2, (double*) out, local_o_start, local_no, global_no, MPI_COMM_WORLD); + pfft_print_array(2, 2, trueout, global_no, MPI_COMM_WORLD); + pfft_free(trueout); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "running backward\n"); + /* clear the old input */ + pfft_clear_input_complex(2, n, local_ni, local_i_start, + in); + /* execute parallel backward FFT */ + pfft_execute(plan_back); + + /* Scale data */ + for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1]; l++) + in[l] /= (n[0]*n[1]); + + truein = pfft_gather_array(2, 2, (double*) in, local_i_start, local_ni, global_ni, MPI_COMM_WORLD); + pfft_print_array(2, 2, truein, global_ni, MPI_COMM_WORLD); + pfft_free(truein); + + /* Print error of back transformed data */ + MPI_Barrier(MPI_COMM_WORLD); + err = pfft_check_output_complex(2, n, local_ni, local_i_start, in, comm_cart_2d); + pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td):\n", n[0], n[1]); + pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err); + + /* free mem and finalize */ + pfft_destroy_plan(plan_forw); + pfft_destroy_plan(plan_back); + MPI_Comm_free(&comm_cart_2d); + pfft_free(in); pfft_free(out); + MPI_Finalize(); + return 0; +} From f8b62a66768708ba4dfe92df17a45f778e0c0cee Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Thu, 18 Jan 2018 10:08:38 -0800 Subject: [PATCH 23/35] remove noisy message from api. --- api/api-basic.c | 1 - 1 file changed, 1 deletion(-) diff --git a/api/api-basic.c b/api/api-basic.c index 3f2977b..d1fa382 100644 --- a/api/api-basic.c +++ b/api/api-basic.c @@ -1297,7 +1297,6 @@ static void execute_transposed( { int t; - pfft_fprintf(MPI_COMM_WORLD, stderr, "executing, rnk_pm = %d\nb", rnk_pm); for(t=0; t Date: Thu, 18 Jan 2018 10:09:52 -0800 Subject: [PATCH 24/35] add the third transpose that breaks the roundtrip. But without it the single tranform was wrong anyways. --- kernel/remap_2dto1d.c | 45 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/kernel/remap_2dto1d.c b/kernel/remap_2dto1d.c index b67d456..99ff6ee 100644 --- a/kernel/remap_2dto1d.c +++ b/kernel/remap_2dto1d.c @@ -248,10 +248,6 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( iblk, oblk, icomms, ocomms, local_ni, local_no); - /* n0/p0 x n1/p1 > n1/q0 x n0/p0 */ - nb = local_ni[0]; - nt = local_ni[1]; - ths->local_transp[0] = NULL; ths->local_transp[1] = NULL; ths->global_remap[0] = NULL; @@ -259,7 +255,14 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( /* plan TRANSPOSED_IN in opposite direction */ - /* n1/q0 x n0/p0 -> n0/(p0*q0) x n1 */ + /* p1 == q0 */ + + /* n0/p0 x n1/p1 > n1/q0 x n0/p0 */ + + /* n1/q0 x n0/p0 -> n1 x n0/(p0*q0) */ + + /* n1 x n0/(p0*q0) -> n0 / (p0*q0) x n1 */ + N0 = local_ni[0]; h0 = 1; /* n0 / p0 */ N1 = local_no[1]; h1 = 1; /* n1 */ blk0 = oblk[0]; /* n0/(p0*q0) */ @@ -279,6 +282,10 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( if(~io_flag & PFFT_DESTROY_INPUT) in = out; /* default: compute in-place plans on 'out' in order to preserve inputs */ + nb = local_ni[0]; + nt = local_ni[1]; + + ths->local_transp[1] = PX(plan_sertrafo)( nb, 1, &nt, howmany, out, in, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, @@ -288,8 +295,23 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( N1, N0, h1, h0, hm, blk1, blk0, comm_q0, in, out, PFFT_TRANSPOSED_OUT, fftw_flags); + if(~io_flag & PFFT_DESTROY_INPUT){ + /* restore pointers to 'in_user' and 'out_user' in order to compute the first step out-of-place */ + in = in_user; + } + + nb = local_no[1]; + nt = local_no[0]; + + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); } else { + nb = local_ni[0]; + nt = local_ni[1]; + ths->local_transp[0] = PX(plan_sertrafo)( nb, 1, &nt, howmany, in, out, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, @@ -302,6 +324,19 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( N0, N1, h0, h1, hm, blk0, blk1, comm_q0, out, in, PFFT_TRANSPOSED_IN, fftw_flags); + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + + nb = local_no[1]; + nt = local_no[0]; + + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, out, in, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } MPI_Comm_free(&comm_q0); From 27b3b44b985fcd0b568999885ea322e4b676bce4 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Thu, 18 Jan 2018 10:10:30 -0800 Subject: [PATCH 25/35] c2c_3don3d prints the array for easier consistency checks. --- tests/simple_check_c2c_3d_on_3d.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/simple_check_c2c_3d_on_3d.c b/tests/simple_check_c2c_3d_on_3d.c index 5de88c0..934ab87 100644 --- a/tests/simple_check_c2c_3d_on_3d.c +++ b/tests/simple_check_c2c_3d_on_3d.c @@ -14,8 +14,8 @@ int main(int argc, char **argv) MPI_Comm comm_cart_3d; /* Set size of FFT and process mesh */ - n[0] = 29; n[1] = 27; n[2] = 31; - np[0] = 2; np[1] = 2; np[2] = 2; + n[0] = 4; n[1] = 4; n[2] = 2; + np[0] = 1; np[1] = 1; np[2] = 2; /* Initialize MPI and PFFT */ MPI_Init(&argc, &argv); @@ -54,7 +54,11 @@ int main(int argc, char **argv) /* clear the old input */ pfft_clear_input_complex_3d(n, local_ni, local_i_start, in); - + + double * gout = pfft_gather_array(3, 2, (double*) out, local_i_start, local_ni, n, MPI_COMM_WORLD); + pfft_print_array(3, 2, gout, n, MPI_COMM_WORLD); + pfft_free(gout); + /* execute parallel backward FFT */ pfft_execute(plan_back); From b01e24bb56259a8e2177288b6445758c5e3c0243 Mon Sep 17 00:00:00 2001 From: Michael Pippig Date: Mon, 29 Jan 2018 23:36:06 +0100 Subject: [PATCH 26/35] Minor: fix calculation of plain index (only used for generation of init data, not part of the pfft kernel) --- api/api-basic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/api-basic.c b/api/api-basic.c index d1fa382..36b9d86 100644 --- a/api/api-basic.c +++ b/api/api-basic.c @@ -257,7 +257,7 @@ static INT plain_index( INT k=0; for(INT t=0; t Date: Mon, 29 Jan 2018 23:42:02 +0100 Subject: [PATCH 27/35] Minor: cleanup 2dto1d remap --- kernel/remap_2dto1d.c | 56 ++++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/kernel/remap_2dto1d.c b/kernel/remap_2dto1d.c index 99ff6ee..a036cb5 100644 --- a/kernel/remap_2dto1d.c +++ b/kernel/remap_2dto1d.c @@ -31,14 +31,6 @@ extern MPI_Comm *gdbg_comms_pm; #endif -static void factorize( - int q, - int *ptr_q0, int *ptr_q1); -static void factorize_equal( - int p0, int p1, int q, - int *ptr_q0, int *ptr_q1); - - /* TODO: think about order of in and out for T_IN */ /* TODO: implement user blocksize */ @@ -115,7 +107,7 @@ void PX(local_block_remap_2dto1d_transposed)( } } -static void free_three_comms(MPI_Comm *comms) +static void free_two_comms(MPI_Comm *comms) { const int num_comms = 2; for(int t=0; t N1 x h1 x N0/P x h0 + * with P = q0, N1 = n1, h1 = 1, N0 = n0/p0, h0 = 1 + * */ /* n1/q0 x n0/p0 -> n1 x n0/(p0*q0) * for each q0 ranks, a transpose of a matrix n1 x n0/p0, * from divided along n1, to along n0/p0 - * * */ N0 = local_ni[0]; h0 = 1; /* n0 / p0 */ N1 = local_no[1]; h1 = 1; /* n1 */ @@ -207,8 +202,8 @@ int PX(local_size_remap_2dto1d_transposed)( } /* free communicators */ - free_three_comms(icomms); - free_three_comms(ocomms); + free_two_comms(icomms); + free_two_comms(ocomms); return mem; } @@ -225,10 +220,10 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( { int p0, p1, rnk_pm; INT nb, nt, N0, N1, h0, h1, hm, blk0, blk1; - INT local_ni[3], local_no[3]; - INT iblk[3],oblk[3]; - MPI_Comm icomms[3], ocomms[3]; - MPI_Comm comm_q0, comm_q1; + INT local_ni[2], local_no[2]; + INT iblk[2],oblk[2]; + MPI_Comm icomms[2], ocomms[2]; + MPI_Comm comm_q0; R *in=in_user, *out=out_user; /* remap only works for 2d data on 2d procmesh */ @@ -257,11 +252,10 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( /* p1 == q0 */ - /* n0/p0 x n1/p1 > n1/q0 x n0/p0 */ + /* n0/p0 x n1/p1 -> n1/q0 x n0/p0 */ /* n1/q0 x n0/p0 -> n1 x n0/(p0*q0) */ - /* n1 x n0/(p0*q0) -> n0 / (p0*q0) x n1 */ N0 = local_ni[0]; h0 = 1; /* n0 / p0 */ N1 = local_no[1]; h1 = 1; /* n1 */ @@ -279,13 +273,11 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( /* compute in-place plans on 'out' in order to preserve inputs, * global transp is preferable outof place */ - if(~io_flag & PFFT_DESTROY_INPUT) - in = out; /* default: compute in-place plans on 'out' in order to preserve inputs */ - nb = local_ni[0]; nt = local_ni[1]; - + if(~io_flag & PFFT_DESTROY_INPUT) + in = out; /* default: compute in-place plans on 'out' in order to preserve inputs */ ths->local_transp[1] = PX(plan_sertrafo)( nb, 1, &nt, howmany, out, in, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, @@ -316,7 +308,6 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( nb, 1, &nt, howmany, in, out, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags); - if(~io_flag & PFFT_DESTROY_INPUT) in = out; /* default: compute in-place plans on 'out' in order to preserve inputs */ @@ -324,11 +315,6 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( N0, N1, h0, h1, hm, blk0, blk1, comm_q0, out, in, PFFT_TRANSPOSED_IN, fftw_flags); - ths->local_transp[0] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, in, out, 0, NULL, - trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, - opt_flag, fftw_flags); - nb = local_no[1]; nt = local_no[0]; @@ -343,8 +329,8 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, N0 = %td N1 = %td, blk0 = %td blk1 = %td hm=%td transp_flag %d\n", N0, N1, blk0, blk1, hm, transp_flag ); /* free communicators */ - free_three_comms(icomms); - free_three_comms(ocomms); + free_two_comms(icomms); + free_two_comms(ocomms); return ths; } @@ -459,7 +445,7 @@ void PX(default_block_size_2dto1d)( } -/* allocate array of length 3 for communicators */ +/* allocate array of length 2 for communicators */ static void split_comms_2dto1d( MPI_Comm comm_cart_2d, MPI_Comm *icomms, MPI_Comm *ocomms @@ -492,7 +478,7 @@ void PX(split_cart_procmesh_2dto1d_p0q0)( ) { int p0, q0=0; - int ndims, coords_2d[3]; + int ndims, coords_2d[2]; int dim_1d, period_1d, reorder=0; int color, key; MPI_Comm comm; From 5886b085a580062dc74ad75ccab9744fc59f09b4 Mon Sep 17 00:00:00 2001 From: Michael Pippig Date: Mon, 29 Jan 2018 23:43:53 +0100 Subject: [PATCH 28/35] Minor: edit comment --- kernel/remap_3dto2d.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/remap_3dto2d.c b/kernel/remap_3dto2d.c index 013b56d..e95c474 100644 --- a/kernel/remap_3dto2d.c +++ b/kernel/remap_3dto2d.c @@ -24,11 +24,11 @@ #include "util.h" /* - * useful table of i, o, and m. + * useful table of i, o, and m. p = q0*q1 * * i : n0 / p0 x n1 / p1 x n2 / p2 - * o : n0 / (p0 * q0) x n1 / (p1 * q1) x n2 / 1 * m : n0 / p0 x n1 / (p1 * q1) x n2 / q0 + * o : n0 / (p0 * q0) x n1 / (p1 * q1) x n2 / 1 * * */ From 16544ff1a794be20073429c2cbbde193cc1c1c7d Mon Sep 17 00:00:00 2001 From: Michael Pippig Date: Mon, 29 Jan 2018 23:45:20 +0100 Subject: [PATCH 29/35] bugfix: fix order of in and out arrays --- kernel/remap_2dto1d.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/remap_2dto1d.c b/kernel/remap_2dto1d.c index a036cb5..bf46ae6 100644 --- a/kernel/remap_2dto1d.c +++ b/kernel/remap_2dto1d.c @@ -296,7 +296,7 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( nt = local_no[0]; ths->local_transp[0] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, in, out, 0, NULL, + nb, 1, &nt, howmany, in, in, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags); @@ -305,7 +305,7 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( nt = local_ni[1]; ths->local_transp[0] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, in, out, 0, NULL, + nb, 1, &nt, howmany, in, in, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags); if(~io_flag & PFFT_DESTROY_INPUT) @@ -313,7 +313,7 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( ths->global_remap[1] = PX(plan_global_transp)( N0, N1, h0, h1, hm, blk0, blk1, - comm_q0, out, in, PFFT_TRANSPOSED_IN, fftw_flags); + comm_q0, in, out, PFFT_TRANSPOSED_IN, fftw_flags); nb = local_no[1]; nt = local_no[0]; From 21e278f1fa15dc0a0f3fe8c780893a8e0afdb7a9 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Thu, 8 Feb 2018 19:42:18 +0800 Subject: [PATCH 30/35] protect critical remap_routines with needs_remap. The old head breaks pfft-python everywhere with a crash in local_size_remap_nd_transposed --- kernel/partrafo.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/kernel/partrafo.c b/kernel/partrafo.c index b924d5f..97cd111 100644 --- a/kernel/partrafo.c +++ b/kernel/partrafo.c @@ -264,10 +264,12 @@ INT PX(local_size_partrafo)( lni_to, lis_to, lno_to, los_to); mem = MAX(mem, mem_tmp); - mem_tmp = PX(local_size_remap_nd_transposed)( - rnk_n, pni_to, howmany, comm_cart, PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], - lni_to, lis_to, dummy_ln, dummy_ls); - mem = MAX(mem, mem_tmp); + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) { + mem_tmp = PX(local_size_remap_nd_transposed)( + rnk_n, pni_to, howmany, comm_cart, PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], + lni_to, lis_to, dummy_ln, dummy_ls); + mem = MAX(mem, mem_tmp); + } } /* plan with transposed input */ @@ -278,10 +280,12 @@ INT PX(local_size_partrafo)( lni_ti, lis_ti, lno_ti, los_ti); mem = MAX(mem, mem_tmp); - mem_tmp = PX(local_size_remap_nd_transposed)( - rnk_n, pno_ti, howmany, comm_cart, PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], - dummy_ln, dummy_ls, lno_ti, los_ti); - mem = MAX(mem, mem_tmp); + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) { + mem_tmp = PX(local_size_remap_nd_transposed)( + rnk_n, pno_ti, howmany, comm_cart, PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], + dummy_ln, dummy_ls, lno_ti, los_ti); + mem = MAX(mem, mem_tmp); + } } if(pfft_flags & PFFT_SHIFTED_IN){ @@ -463,10 +467,13 @@ PX(plan) PX(plan_partrafo)( ths->serial_trafo, ths->global_remap); } else { - ths->remap_nd[0] = PX(plan_remap_nd_transposed)( - rnk_n, pni_to, howmany, comm_cart, in, out, - PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], opt_flag, io_flag, fftw_flags); - + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) { + ths->remap_nd[0] = PX(plan_remap_nd_transposed)( + rnk_n, pni_to, howmany, comm_cart, in, out, + PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], opt_flag, io_flag, fftw_flags); + } else { + ths->remap_nd[0] = NULL; + } /* If remap_nd exists, go on with in-place transforms in order to preserve input. */ if( (ths->remap_nd[0] != NULL) && (~io_flag & PFFT_DESTROY_INPUT) ) in = out; @@ -498,9 +505,13 @@ PX(plan) PX(plan_partrafo)( if( ~io_flag & PFFT_DESTROY_INPUT ) in = out; - ths->remap_nd[1] = PX(plan_remap_nd_transposed)( - rnk_n, pno_ti, howmany, comm_cart, out, in, - PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], opt_flag, io_flag, fftw_flags); + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) { + ths->remap_nd[1] = PX(plan_remap_nd_transposed)( + rnk_n, pno_ti, howmany, comm_cart, out, in, + PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], opt_flag, io_flag, fftw_flags); + } else { + ths->remap_nd[1] = NULL; + } } From 184045dab9a27833f2ad7b625e7daad49e35fe46 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Mon, 14 May 2018 16:29:06 -0700 Subject: [PATCH 31/35] remove a few printfs --- kernel/remap_2dto1d.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/remap_2dto1d.c b/kernel/remap_2dto1d.c index bf46ae6..5aa6b0a 100644 --- a/kernel/remap_2dto1d.c +++ b/kernel/remap_2dto1d.c @@ -157,7 +157,7 @@ int PX(local_size_remap_2dto1d_transposed)( nb, 1, &nt, howmany, trafo_flag); mem = MAX(mem, mem_tmp); - pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp local1 = %td\n", mem_tmp); + // pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp local1 = %td\n", mem_tmp); /* N1/P x h1 x N0 x h0 -> N1 x h1 x N0/P x h0 * with P = q0, N1 = n1, h1 = 1, N0 = n0/p0, h0 = 1 @@ -172,7 +172,7 @@ int PX(local_size_remap_2dto1d_transposed)( blk1 = iblk[1]; /* n1 / q0 */ hm = 1; /* set hm to 1 since mem will be in units of real/complex */ - pfft_fprintf(MPI_COMM_WORLD, stderr, "sizing N0 = %td N1 = %td, blk0 = %td blk1 = %td\n", N0, N1, blk0, blk1); + // pfft_fprintf(MPI_COMM_WORLD, stderr, "sizing N0 = %td N1 = %td, blk0 = %td blk1 = %td\n", N0, N1, blk0, blk1); PX(split_cart_procmesh_for_2dto1d_remap_q0)(comm_cart_2d, &comm_q0); mem_tmp = PX(local_size_global_transp)( @@ -180,7 +180,7 @@ int PX(local_size_remap_2dto1d_transposed)( mem = MAX(mem, mem_tmp); MPI_Comm_free(&comm_q0); - pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp global = %td\n", mem_tmp); + // pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp global = %td\n", mem_tmp); /* n1 x n0/(p0*q0) -> n0/(p0*q0) x n1 */ nb = local_no[1]; @@ -190,7 +190,7 @@ int PX(local_size_remap_2dto1d_transposed)( nb, 1, &nt, howmany, trafo_flag); mem = MAX(mem, mem_tmp); - pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp local2 = %td\n", mem_tmp); + // pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp local2 = %td\n", mem_tmp); /* take care of transposed data ordering */ if(transp_flag & PFFT_TRANSPOSED_OUT){ @@ -263,9 +263,9 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( blk1 = iblk[1]; /* n1 / q0 */ hm = howmany * (trafo_flag & PFFTI_TRAFO_C2C ? 2 : 1); - pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, local_ni = %td %td\n", local_ni[0], local_ni[1]); + // pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, local_ni = %td %td\n", local_ni[0], local_ni[1]); - pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, local_no = %td %td\n", local_no[0], local_no[1]); + // pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, local_no = %td %td\n", local_no[0], local_no[1]); PX(split_cart_procmesh_for_2dto1d_remap_q0)(comm_cart_2d, &comm_q0); if(transp_flag & PFFT_TRANSPOSED_IN) { @@ -326,7 +326,7 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( } MPI_Comm_free(&comm_q0); - pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, N0 = %td N1 = %td, blk0 = %td blk1 = %td hm=%td transp_flag %d\n", N0, N1, blk0, blk1, hm, transp_flag ); + // pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, N0 = %td N1 = %td, blk0 = %td blk1 = %td hm=%td transp_flag %d\n", N0, N1, blk0, blk1, hm, transp_flag ); /* free communicators */ free_two_comms(icomms); @@ -342,7 +342,7 @@ static void init_blks_comms_local_size( INT *local_ni, INT *local_no ) { - int p0, p1, q0, q1; + int p0, q0; PX(get_procmesh_dims_1d)(comm_cart_2d, &p0, &q0); From fe9732c0efb6c077bc774ed6598f0d6b0b39db4e Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Mon, 14 May 2018 16:31:56 -0700 Subject: [PATCH 32/35] follow convention to write to input if DESTROY_INPUT is set. This fixed all cases but the PADDED r2C and likely the shifted transforms. The problem with the padded r2c transforms is that somehow the last axis local_ni is not padded. It should be a trivial fix? I am lost here, because the size is actually computed outside the remap routines by partrafo.c The routines looked generic enough to support any decompositions, including 2d on 2d and 3d on 3d. --- kernel/remap_2dto1d.c | 74 +++++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 23 deletions(-) diff --git a/kernel/remap_2dto1d.c b/kernel/remap_2dto1d.c index 5aa6b0a..4872245 100644 --- a/kernel/remap_2dto1d.c +++ b/kernel/remap_2dto1d.c @@ -276,53 +276,81 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( nb = local_ni[0]; nt = local_ni[1]; - if(~io_flag & PFFT_DESTROY_INPUT) - in = out; /* default: compute in-place plans on 'out' in order to preserve inputs */ - ths->local_transp[1] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, out, in, 0, NULL, - trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, - opt_flag, fftw_flags); + if(~io_flag & PFFT_DESTROY_INPUT) { + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, out, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } else { + /* destroy input means the result is written in the input */ + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, out, in, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } ths->global_remap[0] = PX(plan_global_transp)( N1, N0, h1, h0, hm, blk1, blk0, - comm_q0, in, out, PFFT_TRANSPOSED_OUT, fftw_flags); - - if(~io_flag & PFFT_DESTROY_INPUT){ - /* restore pointers to 'in_user' and 'out_user' in order to compute the first step out-of-place */ - in = in_user; - } + comm_q0, out, out, PFFT_TRANSPOSED_OUT, fftw_flags); nb = local_no[1]; nt = local_no[0]; ths->local_transp[0] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, in, in, 0, NULL, + nb, 1, &nt, howmany, in, out, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, - opt_flag, fftw_flags); + opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); + if(ths->local_transp[0] == NULL) { + abort(); + } + + if(ths->global_remap[0] == NULL) { + abort(); + } + + if(ths->local_transp[1] == NULL) { + abort(); + } } else { nb = local_ni[0]; nt = local_ni[1]; ths->local_transp[0] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, in, in, 0, NULL, + nb, 1, &nt, howmany, in, out, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, - opt_flag, fftw_flags); - if(~io_flag & PFFT_DESTROY_INPUT) - in = out; /* default: compute in-place plans on 'out' in order to preserve inputs */ + opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); ths->global_remap[1] = PX(plan_global_transp)( N0, N1, h0, h1, hm, blk0, blk1, - comm_q0, in, out, PFFT_TRANSPOSED_IN, fftw_flags); + comm_q0, out, out, PFFT_TRANSPOSED_IN, fftw_flags); nb = local_no[1]; nt = local_no[0]; - ths->local_transp[1] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, out, in, 0, NULL, - trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, - opt_flag, fftw_flags); + if(~io_flag & PFFT_DESTROY_INPUT) { + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, out, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } else { + /* destroy input means the result shall be in in*/ + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, out, in, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } + if(ths->local_transp[0] == NULL) { + abort(); + } + if(ths->global_remap[1] == NULL) { + abort(); + } + + if(ths->local_transp[1] == NULL) { + abort(); + } } MPI_Comm_free(&comm_q0); From 15572f26c842078269ec83483c39d865d55c20c9 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Mon, 14 May 2018 18:39:26 -0700 Subject: [PATCH 33/35] clean up the code. --- kernel/remap_2dto1d.c | 115 ++++++++++++++++++++---------------------- kernel/remap_3dto2d.c | 8 --- 2 files changed, 56 insertions(+), 67 deletions(-) diff --git a/kernel/remap_2dto1d.c b/kernel/remap_2dto1d.c index 4872245..5a8c6a1 100644 --- a/kernel/remap_2dto1d.c +++ b/kernel/remap_2dto1d.c @@ -157,8 +157,6 @@ int PX(local_size_remap_2dto1d_transposed)( nb, 1, &nt, howmany, trafo_flag); mem = MAX(mem, mem_tmp); - // pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp local1 = %td\n", mem_tmp); - /* N1/P x h1 x N0 x h0 -> N1 x h1 x N0/P x h0 * with P = q0, N1 = n1, h1 = 1, N0 = n0/p0, h0 = 1 * */ @@ -172,16 +170,12 @@ int PX(local_size_remap_2dto1d_transposed)( blk1 = iblk[1]; /* n1 / q0 */ hm = 1; /* set hm to 1 since mem will be in units of real/complex */ - // pfft_fprintf(MPI_COMM_WORLD, stderr, "sizing N0 = %td N1 = %td, blk0 = %td blk1 = %td\n", N0, N1, blk0, blk1); - PX(split_cart_procmesh_for_2dto1d_remap_q0)(comm_cart_2d, &comm_q0); mem_tmp = PX(local_size_global_transp)( N0, N1, h0, h1, hm, blk0, blk1, comm_q0); mem = MAX(mem, mem_tmp); MPI_Comm_free(&comm_q0); - // pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp global = %td\n", mem_tmp); - /* n1 x n0/(p0*q0) -> n0/(p0*q0) x n1 */ nb = local_no[1]; nt = local_no[0]; @@ -190,8 +184,6 @@ int PX(local_size_remap_2dto1d_transposed)( nb, 1, &nt, howmany, trafo_flag); mem = MAX(mem, mem_tmp); - // pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp local2 = %td\n", mem_tmp); - /* take care of transposed data ordering */ if(transp_flag & PFFT_TRANSPOSED_OUT){ get_local_blocks_by_comms(pn, iblk, icomms, oblk, ocomms, @@ -263,99 +255,104 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( blk1 = iblk[1]; /* n1 / q0 */ hm = howmany * (trafo_flag & PFFTI_TRAFO_C2C ? 2 : 1); - // pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, local_ni = %td %td\n", local_ni[0], local_ni[1]); - - // pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, local_no = %td %td\n", local_no[0], local_no[1]); PX(split_cart_procmesh_for_2dto1d_remap_q0)(comm_cart_2d, &comm_q0); + /* The break up into transformations here is likely suboptimal + * but at least it gives correct result. + * + * The tricky fact is that depending on DESTROY_INPUT the result + * of this routine shall be either in 'in' or in 'out'. + * */ if(transp_flag & PFFT_TRANSPOSED_IN) { R * tmp; - /* compute in-place plans on 'out' in order to preserve inputs, - * global transp is preferable outof place */ nb = local_ni[0]; nt = local_ni[1]; if(~io_flag & PFFT_DESTROY_INPUT) { + /* preserve input means the result is written in the output */ ths->local_transp[1] = PX(plan_sertrafo)( nb, 1, &nt, howmany, out, out, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags); + + ths->global_remap[0] = PX(plan_global_transp)( + N1, N0, h1, h0, hm, blk1, blk0, + comm_q0, out, out, PFFT_TRANSPOSED_OUT, fftw_flags); + + nb = local_no[1]; + nt = local_no[0]; + + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); } else { /* destroy input means the result is written in the input */ ths->local_transp[1] = PX(plan_sertrafo)( nb, 1, &nt, howmany, out, in, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags); - } - ths->global_remap[0] = PX(plan_global_transp)( - N1, N0, h1, h0, hm, blk1, blk0, - comm_q0, out, out, PFFT_TRANSPOSED_OUT, fftw_flags); + ths->global_remap[0] = PX(plan_global_transp)( + N1, N0, h1, h0, hm, blk1, blk0, + comm_q0, out, out, PFFT_TRANSPOSED_OUT, fftw_flags); - nb = local_no[1]; - nt = local_no[0]; + nb = local_no[1]; + nt = local_no[0]; - ths->local_transp[0] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, in, out, 0, NULL, - trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, - opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); - - if(ths->local_transp[0] == NULL) { - abort(); + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); } - if(ths->global_remap[0] == NULL) { - abort(); - } - - if(ths->local_transp[1] == NULL) { - abort(); - } } else { - nb = local_ni[0]; - nt = local_ni[1]; + if(~io_flag & PFFT_DESTROY_INPUT) { + /* preserve input means the result is written in the output */ + nb = local_ni[0]; + nt = local_ni[1]; - ths->local_transp[0] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, in, out, 0, NULL, - trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, - opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); - ths->global_remap[1] = PX(plan_global_transp)( - N0, N1, h0, h1, hm, blk0, blk1, - comm_q0, out, out, PFFT_TRANSPOSED_IN, fftw_flags); + ths->global_remap[1] = PX(plan_global_transp)( + N0, N1, h0, h1, hm, blk0, blk1, + comm_q0, out, out, PFFT_TRANSPOSED_IN, fftw_flags); - nb = local_no[1]; - nt = local_no[0]; + nb = local_no[1]; + nt = local_no[0]; - if(~io_flag & PFFT_DESTROY_INPUT) { ths->local_transp[1] = PX(plan_sertrafo)( nb, 1, &nt, howmany, out, out, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags); } else { - /* destroy input means the result shall be in in*/ + /* destroy input means the result is written in the input */ + nb = local_ni[0]; + nt = local_ni[1]; + + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); + + ths->global_remap[1] = PX(plan_global_transp)( + N0, N1, h0, h1, hm, blk0, blk1, + comm_q0, out, out, PFFT_TRANSPOSED_IN, fftw_flags); + + nb = local_no[1]; + nt = local_no[0]; ths->local_transp[1] = PX(plan_sertrafo)( nb, 1, &nt, howmany, out, in, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags); } - if(ths->local_transp[0] == NULL) { - abort(); - } - - if(ths->global_remap[1] == NULL) { - abort(); - } - - if(ths->local_transp[1] == NULL) { - abort(); - } } MPI_Comm_free(&comm_q0); - // pfft_fprintf(MPI_COMM_WORLD, stderr, "planning, N0 = %td N1 = %td, blk0 = %td blk1 = %td hm=%td transp_flag %d\n", N0, N1, blk0, blk1, hm, transp_flag ); - /* free communicators */ free_two_comms(icomms); free_two_comms(ocomms); diff --git a/kernel/remap_3dto2d.c b/kernel/remap_3dto2d.c index e95c474..1f47794 100644 --- a/kernel/remap_3dto2d.c +++ b/kernel/remap_3dto2d.c @@ -176,8 +176,6 @@ int PX(local_size_remap_3dto2d_transposed)( nb, 1, &nt, howmany, trafo_flag); mem = MAX(mem, mem_tmp); - pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp local1 = %td\n", mem_tmp); - /* n2/(q0*q1) x n0/p0 x n1/p1 -> n2/q0 x n0/p0 x n1/(p1*q1) */ N0 = local_ni[1]; h0 = 1; N1 = local_nm[2]; h1 = local_ni[0]; @@ -191,8 +189,6 @@ int PX(local_size_remap_3dto2d_transposed)( mem = MAX(mem, mem_tmp); MPI_Comm_free(&comm_q1); - pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp global1 = %td\n", mem_tmp); - /* n2/q0 x n0/p0 x n1/(p1*q1) -> n2 x n0/(p0*q0) x n1/(p1*q1) */ N0 = local_nm[0]; h0 = local_nm[1]; N1 = local_no[2]; h1 = 1; @@ -206,8 +202,6 @@ int PX(local_size_remap_3dto2d_transposed)( mem = MAX(mem, mem_tmp); MPI_Comm_free(&comm_q0); - pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp global2 = %td\n", mem_tmp); - /* n2 x n0/(p0*q0) x n1/(p1*q1) -> n0/(p0*q0) x n1/(p1*q1) x n2 */ nb = local_no[2]; nt = local_no[0] * local_no[1]; @@ -216,8 +210,6 @@ int PX(local_size_remap_3dto2d_transposed)( nb, 1, &nt, howmany, trafo_flag); mem = MAX(mem, mem_tmp); - pfft_fprintf(MPI_COMM_WORLD, stderr, "mem_tmp local2 = %td\n", mem_tmp); - /* take care of transposed data ordering */ if(transp_flag & PFFT_TRANSPOSED_OUT){ get_local_blocks_by_comms(pn, iblk, icomms, oblk, ocomms, From bc0b03b383cdc5f13f51e1cda95ed1c52915604b Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Mon, 14 May 2018 18:46:48 -0700 Subject: [PATCH 34/35] roughly follow the heuristics in remap_3dto2d.c It appears that we shall prefer out of place global_transp as much as possible. The FFTW_PRESERVE_INPUT flag is added to signify that the transform must be preserving input; it should have been added automatically. Also expanding the code branches to make it easier to parse what is actually done in different cases. --- kernel/remap_2dto1d.c | 46 +++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/kernel/remap_2dto1d.c b/kernel/remap_2dto1d.c index 5a8c6a1..19f507e 100644 --- a/kernel/remap_2dto1d.c +++ b/kernel/remap_2dto1d.c @@ -266,20 +266,8 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( if(transp_flag & PFFT_TRANSPOSED_IN) { R * tmp; - nb = local_ni[0]; - nt = local_ni[1]; - if(~io_flag & PFFT_DESTROY_INPUT) { /* preserve input means the result is written in the output */ - ths->local_transp[1] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, out, out, 0, NULL, - trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, - opt_flag, fftw_flags); - - ths->global_remap[0] = PX(plan_global_transp)( - N1, N0, h1, h0, hm, blk1, blk0, - comm_q0, out, out, PFFT_TRANSPOSED_OUT, fftw_flags); - nb = local_no[1]; nt = local_no[0]; @@ -287,17 +275,20 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( nb, 1, &nt, howmany, in, out, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); - } else { - /* destroy input means the result is written in the input */ - ths->local_transp[1] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, out, in, 0, NULL, - trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, - opt_flag, fftw_flags); + + nb = local_ni[0]; + nt = local_ni[1]; ths->global_remap[0] = PX(plan_global_transp)( N1, N0, h1, h0, hm, blk1, blk0, comm_q0, out, out, PFFT_TRANSPOSED_OUT, fftw_flags); + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, out, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } else { + /* destroy input means the result is written in the input */ nb = local_no[1]; nt = local_no[0]; @@ -305,6 +296,19 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( nb, 1, &nt, howmany, in, out, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); + + nb = local_ni[0]; + nt = local_ni[1]; + + /* keep the global out of place to make it 'faster'? */ + ths->global_remap[0] = PX(plan_global_transp)( + N1, N0, h1, h0, hm, blk1, blk0, + comm_q0, out, in, PFFT_TRANSPOSED_OUT, fftw_flags); + + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, in, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); } } else { @@ -337,16 +341,16 @@ remap_nd_plan PX(plan_remap_2dto1d_transposed)( ths->local_transp[0] = PX(plan_sertrafo)( nb, 1, &nt, howmany, in, out, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, - opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); + opt_flag, fftw_flags); ths->global_remap[1] = PX(plan_global_transp)( N0, N1, h0, h1, hm, blk0, blk1, - comm_q0, out, out, PFFT_TRANSPOSED_IN, fftw_flags); + comm_q0, out, in, PFFT_TRANSPOSED_IN, fftw_flags); nb = local_no[1]; nt = local_no[0]; ths->local_transp[1] = PX(plan_sertrafo)( - nb, 1, &nt, howmany, out, in, 0, NULL, + nb, 1, &nt, howmany, in, in, 0, NULL, trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, opt_flag, fftw_flags); } From d65d0a83ee801f73295d522b116f42a06893639b Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Fri, 18 May 2018 18:50:13 -0700 Subject: [PATCH 35/35] update the name to label the 2don2d release. --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 9683c4f..f044371 100644 --- a/configure.ac +++ b/configure.ac @@ -31,7 +31,7 @@ AC_PREREQ(2.64) # autoconf initialization -AC_INIT([PFFT],[1.0.8-alpha3-fftw3],[michael.pippig@mathematik.tu-chemnitz.de],[pfft],[http://www.tu-chemnitz.de/~mpip/software/]) +AC_INIT([PFFT],[1.0.8-alpha3-fftw3-2don2d],[michael.pippig@mathematik.tu-chemnitz.de],[pfft],[http://www.tu-chemnitz.de/~mpip/software/]) m4_ifndef([AC_PACKAGE_URL], [AC_SUBST([PACKAGE_URL], [http://www.tu-chemnitz.de/~mpip/software/])])