diff --git a/doc/Makefile-sphinx b/doc/Makefile-sphinx
new file mode 100644
index 0000000..33668b7
--- /dev/null
+++ b/doc/Makefile-sphinx
@@ -0,0 +1,192 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# User-friendly check for sphinx-build (POSIX 'command -v'; any non-zero status means "not found")
+ifneq ($(shell command -v $(SPHINXBUILD) >/dev/null 2>&1; echo $$?),0)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp applehelp devhelp epub latex latexpdf latexpdfja text man texinfo info gettext changes linkcheck doctest coverage xml pseudoxml
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  applehelp  to make an Apple Help Book"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+	@echo "  coverage   to run coverage check of the documentation (if enabled)"
+
+clean:
+	test -n "$(BUILDDIR)" && rm -rf "$(BUILDDIR)"/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PFFT.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PFFT.qhc"
+
+applehelp:
+	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+	@echo
+	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+	@echo "N.B. You won't be able to view it unless you put it in" \
+	      "~/Library/Documentation/Help or install it in your application" \
+	      "bundle."
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/PFFT"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PFFT"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through platex and dvipdfmx..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	$(MAKE) -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+coverage:
+	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
+	@echo "Testing of coverage in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/coverage/python.txt."
+
+xml:
+	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+	@echo
+	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+	@echo
+	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..38b441b
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,291 @@
+# -*- coding: utf-8 -*-
+#
+# PFFT documentation build configuration file, created by
+# sphinx-quickstart on Sun Sep 13 01:20:34 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+import shlex
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.todo',
+    'sphinx.ext.mathjax',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'PFFT'
+copyright = u'2015, Michael Pippig'
+author = u'Michael Pippig'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '1.0.8'
+# The full version, including alpha/beta/rc tags.
+release = '1.0.8'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = True
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'nature'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
+#html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# Now only 'ja' uses this config value
+#html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'PFFTdoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+'preamble': r"""
+\usepackage{amsmath}
+\usepackage{amssymb}
+\usepackage{nicefrac}
+""",
+
+# Latex figure (float) alignment
+#'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  (master_doc, 'PFFT.tex', u'PFFT Documentation',
+   u'Michael Pippig', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'pfft', u'PFFT Documentation',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  (master_doc, 'PFFT', u'PFFT Documentation',
+   author, 'PFFT', 'One line description of project.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/doc/convert.sh b/doc/convert.sh
new file mode 100644
index 0000000..242a9eb
--- /dev/null
+++ b/doc/convert.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+# Convert every LaTeX source in this directory to reStructuredText via pandoc.
+for i in *.tex; do
+    [ -e "$i" ] || continue   # no .tex files: the glob stays literal, skip it
+    echo "$i"
+    # Prepend the shared preamble, then rewrite macros pandoc does not know.
+    sed -e 's;\\hdots;\\dots;g' \
+        -e 's/\\code{\([^}]*\)}/\\verb+\1+/g' \
+        preample.tex "$i" | pandoc -f latex -t rst > "${i%.tex}.rst"
+done
+
diff --git a/doc/develop.rst b/doc/develop.rst
new file mode 100644
index 0000000..4d4830d
--- /dev/null
+++ b/doc/develop.rst
@@ -0,0 +1,44 @@
+.. This page was converted from the LaTeX manual with pandoc (see
+   convert.sh). Unconvertible preamble macros are intentionally not
+   reproduced here.
+
+Developers Guide
+================
+
+Search and replace patterns
+---------------------------
+
+Correct alignment of pfft.h header
+
+::
+
+    %s/^\(    [^ ]\+[^\\]*\)  \\/  \1\\/g  
+
+Expand most macros of pfft.h to generate the function reference of this
+manual:
+
+::
+
+    sed -e 's/ *\\$//g' -e 's/PFFT_EXTERN //g' \
+        -e 's/PX(\([^)]*\))/pfft_\1/g' -e 's/ INT/ ptrdiff_t/g' \
+        -e 's/ R/ double/g' -e 's/ C/ pfft_complex/g' \
+        -e 's/^  //g' pfft.h > pfft.h.expanded
+
+ToDo
+====
+
+-  ``PFFT_FORWARD`` is defined as ``FFTW_FORWARD``
+
+-  ``FFTW_FORWARD`` is defined as :math:`-1`
+
+-  PFFT allows one to choose between ``FFTW_FORWARD`` and
+   ``FFTW_BACKWARD``, which is not implemented by FFTW.
+
+-  Matlab uses the same sign convention, i.e., :math:`-1` for ``fft``
+   and :math:`+1` for ``ifftn``
+
+Measuring parallel run times
+----------------------------
+
+Use ``MPI_Barrier`` in front of every call to ``pfft_`` function to
+avoid unbalanced run times.
diff --git a/doc/features.rst b/doc/features.rst
new file mode 100644
index 0000000..f381495
--- /dev/null
+++ b/doc/features.rst
@@ -0,0 +1,183 @@
+.. This page was converted from the LaTeX manual with pandoc (see
+   convert.sh). Unconvertible preamble macros are intentionally not
+   reproduced here.
+
+Advanced Features
+=================
+
+How to Deal with FFT Index Shifts in Parallel
+---------------------------------------------
+
+Let :math:`n\in2{{\mathbb{N}}}`. A common problem is that the index of
+the FFT input and/or output array runs between
+:math:`-\nicefrac n2,\dots,\nicefrac n2-1`, but the FFT library requires
+them to run between :math:`0,\dots,n-1`. With serial program execution
+one can easily remap the input data :math:`\hat g_k` in a way that is
+suitable for the library, i.e.,
+
+.. math:: \hat f_k := \hat g_{(k-\nicefrac n2\bmod n)}, \quad k = 0,\dots,n-1.
+
+Similarly, one could remap the outputs of the library :math:`f_l`,
+:math:`l=0,\cdots,n-1` in the opposite direction in order to get the
+required outputs, i.e.,
+
+.. math:: g_l := f_{l \bmod n}, \quad l = -\nicefrac n2,\dots,\nicefrac n2-1.
+
+These shifts are also known as ``fftshift`` in Matlab.
+
+However, with distributed memory these ``fftshift`` operations require
+more complex data movements and result in a global communication. For
+example, the first index of the array moves to the middle and,
+therefore, the corresponding data move to another MPI process.
+Fortunately, this communication can be avoided at the cost of little
+extra computation. At the end of the section we present two PFFT library
+functions that perform the necessary pre- and postprocessing for shifted
+input and output index sets.
+
+Shift with half the FFT size
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The special case of input shift :math:`k_s=-\nicefrac n2` and/or output
+shift :math:`l_s=-\nicefrac n2` is supported by PFFT. The user can choose
+to shift the input (``PFFT_SHIFTED_IN``) and/or to shift the output
+(``PFFT_SHIFTED_OUT``).
+
+Here, we are interested in the computation of
+
+.. math:: g_l = \sum_{k=-\nicefrac{n_i}{2}}^{\nicefrac{n_i}{2}-1} \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1
+
+with :math:`n, n_i, n_o \in 2{{\mathbb{N}}}` and :math:`n>n_i`,
+:math:`n>n_o`.
+
+With an index shift of :math:`\nicefrac n2` both in :math:`k` and
+:math:`l` this is equivalent to the computation of
+
+.. math::
+
+   \begin{aligned}
+     g_{(l-\nicefrac{n}{2})}
+     &= \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
+        \hat g_{(k-\nicefrac{n}{2})} {{\mathrm{e}^{-2\pi{{{\text{i}}}} (k-\nicefrac n2)(l-\nicefrac n2)/n}}} \\
+     &= {{{\mathrm{e}}}}^{+\pi{{\text{i}}}l} 
+          \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
+          \left(\hat g_{(k-\nicefrac{n}{2})}{{{\mathrm{e}}}}^{+\pi{{\text{i}}}(k-\nicefrac n2)}\right) {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}} \\
+     &= {{{\mathrm{e}}}}^{+\pi{{\text{i}}}(l-\nicefrac n2)} 
+        \underbrace{
+          \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
+          \underbrace{\left(\hat g_{(k-\nicefrac{n}{2})}{{{\mathrm{e}}}}^{+\pi{{\text{i}}}k}\right)}_{\hat f_k} {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}
+        }_{f_l}\end{aligned}
+
+for
+:math:`l=\nicefrac n2-\nicefrac{n_o}{2},\dots,\nicefrac n2 +\nicefrac{n_o}{2}-1`.
+Therefore, we get the following algorithm
+
+.. math:: f_l = \sum_{k=0}^n \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1
+
+The special case :math:`k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}`
+corresponds to the shifts the arrays ()
+
+For :math:`k=0,\dots,n-1` set :math:`\hat f_k = 0`. For
+:math:`k=-\nicefrac{n_i}{2},\dots,\nicefrac{n_i}{2}-1` compute
+:math:`\hat f_{(k+\nicefrac{n}{2})} = (-1)^{(k+\nicefrac{n}{2})} \hat g_{k}`.
+For :math:`l=0,\dots,n-1` compute
+:math:`f_l = \sum_{k=0}^{n-1} \hat f_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}`
+using PFFT. For :math:`l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1`
+compute :math:`g_l = (-1)^l f_{(l+n/2)}`.
+
+Note, that this shift implies that the library deals with pruned FFTs in
+a special way, i.e., half of the zeros are added at the beginning of the
+inputs and the other half is added at the end.
+
+Arbitrary shifts
+~~~~~~~~~~~~~~~~
+
+More general shifts must be done by the user.
+
+In a more general setting, we are interested in the computation of FFTs
+with shifted index sets, i.e., assume :math:`k_s,l_s\in{{\mathbb{Z}}}`
+and compute
+
+.. math::
+
+   g_l = \sum_{k=k_s}^{n_i+k_s-1} \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}},
+     \quad l=l_s,\dots,n_o+l_s-1\,.
+
+Because of the periodicity of the FFT this can be easily performed by
+the following index translation algorithm.
+
+**Algorithm (index translation):**
+
+For :math:`k=0,\dots,n_i-1` assign
+:math:`\hat f_k = \hat g_{(k+k_s\bmod n_i)}`. For
+:math:`l=0,\dots,n_o-1` compute
+:math:`f_l = \sum_{k=0}^{n_i-1} \hat f_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}`
+using PFFT. For :math:`l=0,\dots,n_o-1` assign
+:math:`g_l = f_{(l-l_s\bmod n_o)}`.
+
+However, this involves explicit data movement since the sequence of data
+changes. For our parallel data decomposition the change of data layout
+requires data communication. A simple index shift results in the
+computation of
+
+.. math::
+
+   \begin{aligned}
+     g_{l+l_s}
+     &=
+       \sum_{k=k_s}^{n_i+k_s-1} \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} k(l+l_s)/n}}}
+       =
+       \sum_{k=0}^{n_i-1} \hat g_{k+k_s} {{\mathrm{e}^{-2\pi{{{\text{i}}}} (k+k_s)(l+l_s)/n}}} \\
+     &=
+       {{\mathrm{e}^{-2\pi{{{\text{i}}}} k_sl/n}}} \sum_{k=0}^{n_i-1} \underbrace{\left(\hat g_{k+k_s}{{\mathrm{e}^{-2\pi{{{\text{i}}}} (k+k_s)l_s/n}}}\right)}_{=: \hat f_k} {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}\end{aligned}
+
+for all :math:`l=0,\dots,n_o-1`. The resulting index modulation
+algorithm below preserves the sequence of data at
+the price of some extra computation.
+
+**Algorithm (index modulation):**
+
+For :math:`k=0,\dots,n_i-1` compute
+:math:`\hat f_k = \hat g_{(k+k_s)} {{\mathrm{e}^{-2\pi{{{\text{i}}}} (k+k_s)l_s/n}}}`.
+For :math:`l=0,\dots,n_o-1` compute
+:math:`f_l = \sum_{k=0}^{n_i-1} \hat f_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}`
+using PFFT. For :math:`l=0,\dots,n_o-1` compute
+:math:`g_{(l+l_s)} = f_l {{\mathrm{e}^{-2\pi{{{\text{i}}}} k_sl/n}}}`.
+
+The special case :math:`k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}`
+corresponds to the shifts the arrays ()
+
+For :math:`k=0,\dots,n_i-1` compute
+:math:`\hat f_k = \hat g_{(k-\nicefrac{n_i}{2})} {{{\mathrm{e}}}}^{+\pi{{\text{i}}}(k-\nicefrac{n_i}{2})n_o/n}`.
+For :math:`l=0,\dots,n_o-1` compute
+:math:`f_l = \sum_{k=0}^{n_i-1} \hat f_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}`
+using PFFT. For :math:`l=0,\dots,n_o-1` compute
+:math:`g_{(l-\nicefrac{n_o}{2})} = f_l {{{\mathrm{e}}}}^{+\pi{{\text{i}}}n_i l/n}`.
+
+Parallel pruned FFT
+-------------------
+
+Within PFFT we define a pruned FFT as
+
+.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_{k} {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=0,\dots,n_o-1.
+
+Formally, this is equivalent to the following regular size :math:`n`
+FFT
+
+.. math:: f_l = \sum_{k=0}^{n-1} \hat f_{k} {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=0,\dots,n-1,
+
+with
+
+.. math::
+
+   \hat f_k :=
+     \begin{cases}
+     \hat g_k, & k=0,\dots,n_i-1, \\
+     0,        & k=n_i,\dots,n-1,
+     \end{cases}
+
+and :math:`g_l := f_l`, :math:`l=0,\dots,n_o-1`. I.e., we add
+:math:`n-n_i` zeros at the end of the input array and throw away
+:math:`n-n_o` entries at the end of the output array.
+
+The definition of pruned FFT changes for ``PFFT_SHIFTED_IN`` and
+``PFFT_SHIFTED_OUT``.
diff --git a/doc/fortran.rst b/doc/fortran.rst
new file mode 100644
index 0000000..864aa83
--- /dev/null
+++ b/doc/fortran.rst
@@ -0,0 +1,8 @@
+.. This page was converted from the LaTeX manual with pandoc (see
+   convert.sh). Unconvertible preamble macros are intentionally not
+   reproduced here.
+
+Fortran Interface
+=================
+
+based on Fortran 90
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..2e0d8e2
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,30 @@
+.. PFFT documentation master file, created by
+   sphinx-quickstart on Sun Sep 13 01:20:34 2015.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to PFFT's documentation!
+================================
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+
+   intro
+   tutorial
+   install
+   features
+   interface
+   reference
+   develop
+
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/doc/install.rst b/doc/install.rst
new file mode 100644
index 0000000..7eb1370
--- /dev/null
+++ b/doc/install.rst
@@ -0,0 +1,151 @@
+.. This page was converted from the LaTeX manual with pandoc (see
+   convert.sh). Unconvertible preamble macros are intentionally not
+   reproduced here.
+
+Installation and linking
+========================
+
+The install of PFFT is based on the Autotools and follows the typical
+workflow
+
+::
+
+    ./configure
+    make
+    make install
+
+Install of the latest official FFTW release
+-------------------------------------------
+
+PFFT depends on Release 3.3.3 of the FFTW library. For the sake of
+completeness, we show the command line based install procedure in the
+following. However, note that we provide install scripts on
+`www.tu-chemnitz.de/~mpip/software.php <http://www.tu-chemnitz.de/~mpip/software.php>`__
+that simplify the install a lot. We highly recommend to use these install
+scripts, since they additionally apply several performance patches and
+bugfixes that have been submitted to the FFTW developers but are not yet
+included in the official FFTW releases.
+
+::
+
+    wget http://www.fftw.org/fftw-3.3.3.tar.gz
+    tar xzvf fftw-3.3.3.tar.gz
+    cd fftw-3.3.3
+    ./configure --enable-mpi --prefix=$HOME/local/fftw3_mpi
+    make
+    make install
+
+The MPI algorithms of FFTW must be built with an MPI C compiler. Add the
+statement ``MPICC=$MPICCOMP`` at the end of the ``./configure`` line if the
+``configure`` script fails to determine the right MPI C compiler
+``$MPICCOMP``. Similarly, the MPI Fortran compiler ``$MPIFCOMP`` is
+set by ``MPIFC=$MPIFCOMP``.
+
+Install of the PFFT library
+---------------------------
+
+In the simplest case, the hardware platform and the FFTW-3.3.3 library are
+recognized by the PFFT configure script automatically, so all we have to
+do is (with ``$PFFTVERSION`` set to the PFFT release to install)
+
+::
+
+    wget http://www.tu-chemnitz.de/~mpip/software/pfft-$PFFTVERSION.tar.gz
+    tar xzvf pfft-$PFFTVERSION.tar.gz
+    cd pfft-$PFFTVERSION
+    ./configure
+    make
+    make check
+    make install
+
+Hereby, the optional call ``make check`` builds the test programs. If
+the FFTW-3.3.3 software library is already installed on your system but not
+found by the configure script, you can provide the FFTW installation
+directory ``$FFTWDIR`` to configure by
+
+.. code:: bash
+
+    ./configure --with-fftw3=$FFTWDIR
+
+This call implies that the FFTW header files are located in
+``$FFTWDIR/include`` and the FFTW library files are located in
+``$FFTWDIR/lib``. Otherwise, one should specify the FFTW include path
+``$FFTWINC`` and the FFTW library path ``$FFTWLIB`` separately by
+
+::
+
+    ./configure --with-fftw3-includedir=$FFTWINC --with-fftw3-libdir=$FFTWLIB
+
+At the end, this is equivalent to
+
+::
+
+    ./configure CPPFLAGS=-I$FFTWINC LDFLAGS=-L$FFTWLIB
+
+which is more common to experienced users of the Autotools. To install
+PFFT in a user specified directory ``$PFFTINSTDIR`` call configure with
+the option
+
+::
+
+    ./configure --prefix=$PFFTINSTDIR
+
+Note that this option is mandatory whenever you do not have root
+permissions on your machine, since the default install paths of
+``configure`` are not accessible by standard users. The PFFT library
+must be built with an MPI compiler. In Section
+`Install of the latest official FFTW release`_ we already described how
+to hand the right compilers to the ``configure`` script. Some more options are
+
+``--enable-float``: Produces a single-precision version
+of PFFT (float) instead of the default double-precision (double); see
+[sec:prec].
+
+``--enable-long-double``: Produces a long-double
+precision version of PFFT (long double) instead of the default
+double-precision (double); see [sec:prec].
+
+``--disable-fortran``: Disables inclusion of Fortran wrapper routines in
+the standard PFFT libraries.
+
+``--disable-tests``: Disables build of test programs.
+
+For more details on the options of the ``configure`` script call
+
+::
+
+    ./configure --help
+
+How to include PFFT in your program
+-----------------------------------
+
+All programs using PFFT should include its header file
+
+::
+
+    #include <pfft.h>
+
+This header includes the FFTW headers ``fftw3.h``, ``fftw3-mpi.h``
+automatically. Make sure that the compiler can find them by setting the
+include flags appropriately. You must also link to the PFFT, FFTW and
+FFTW-MPI libraries. On Unix, this means adding
+``-lpfft -lfftw3_mpi -lfftw3 -lm`` at the end of the link command. For
+example, to build ``pfft_test.c`` use the following compiler invocation
+
+::
+
+    mpicc pfft_test.c -I$PFFTINC -I$FFTWINC -L$PFFTLIB -L$FFTWLIB -lpfft -lfftw3_mpi -lfftw3 -lm
+
+Substitute ``mpicc`` by any other MPI C compiler if you like.
+``$PFFTINC``, ``$FFTWINC``, ``$PFFTLIB``, and ``$FFTWLIB`` denote
+the PFFT and FFTW include and library paths, respectively. If you use
+the install scripts mentioned above, these paths will be (``$PFFTVERSION``
+and ``$FFTWVERSION`` stand for the installed version numbers)
+
+::
+
+    PFFTINC = $HOME/local/pfft-$PFFTVERSION/include
+    FFTWINC = $HOME/local/fftw-$FFTWVERSION/include
+    PFFTLIB = $HOME/local/pfft-$PFFTVERSION/lib
+    FFTWLIB = $HOME/local/fftw-$FFTWVERSION/lib
+
diff --git a/doc/interface.rst b/doc/interface.rst
new file mode 100644
index 0000000..ff25f73
--- /dev/null
+++ b/doc/interface.rst
@@ -0,0 +1,126 @@
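+.. Residue of LaTeX macro definitions left by the conversion; kept as a comment so it is not rendered:
+   [2]ifpackageloaded#1#2 [2]ifpackageloaded#1#2 [3]ifpackageloaded#1#2#3
+   #1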
+
+Interface Layers of the PFFT Library
+====================================
+
+We give a quick overview of the PFFT interface layers in the order of
+increasing flexibility at the example of c2c-FFTs. For r2c-, c2r-, and
+r2r-FFT similar interface layer specifications apply. A full reference
+list of all PFFT functions is given in Chapter [chap:ref].
+
+Basic Interface
+---------------
+
+The ``_3d`` interface is the simplest interface layer. It is suitable
+for the planning of three-dimensional FFTs.
+
+::
+
+    ptrdiff_t pfft_local_size_dft_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_dft_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart,
+        int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    pfft_plan pfft_plan_dft_3d(
+        const ptrdiff_t *n,
+        pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Hereby, ``n``, ``local_ni``, ``local_i_start``, ``local_no``, and
+``local_o_start`` are ``ptrdiff_t`` arrays of length ``3``.
+
+The basic interface generalizes the ``_3d`` interface to FFTs of
+arbitrary dimension ``rnk_n``.
+
+::
+
+    ptrdiff_t pfft_local_size_dft(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_dft(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    pfft_plan pfft_plan_dft(
+        int rnk_n, const ptrdiff_t *n,
+        pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Therefore, ``n``, ``local_ni``, ``local_i_start``, ``local_no``, and
+``local_o_start`` become arrays of length ``rnk_n``.
+
+Advanced Interface
+------------------
+
+The advanced interface introduces the arrays ``ni`` and ``no`` of length
+``rnk_n`` that give the pruned FFT input and output size. Furthermore,
+the arrays ``iblock`` and ``oblock`` of length ``rnk_pm`` (``rnk_pm``
+being the dimension of the process mesh) serve to adjust the block size
+of the input and output block decomposition. The additional parameter
+``howmany`` gives the number of transforms that will be computed
+simultaneously.
+
+::
+
+    ptrdiff_t pfft_local_size_many_dft(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_many_dft(
+        int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    pfft_plan pfft_plan_many_dft(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Preliminary: Skip Serial Transformations
+----------------------------------------
+
+The ``_skipped`` interface extends the ``_many`` interface by adding the
+possibility to skip some of the serial FFTs.
+
+::
+
+    pfft_plan pfft_plan_many_dft_skipped(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        const int *skip_trafos,
+        pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Hereby, ``skip_trafos`` is an ``int`` array of length ``rnk_pm+1``
+(``rnk_pm`` being the mesh dimension of the communicator ``comm_cart``).
+For ``t=0,...,rnk_pm`` set ``skip_trafos[t]=1`` if the ``t``-th serial
+transformation should be computed, otherwise set ``skip_trafos[t]=0``.
+Note that the local transpositions are always performed, since they are
+a prerequisite for the global communication to work. At the moment it is
+only possible to skip the whole serial transform along the last
+``rnk_n-rnk_pm-1`` dimensions. However, this behaviour can be realized
+by a call of a (``rnk_pm+1``)-dimensional PFFT with
+
+::
+
+    for(int t=rnk_pm+1; t<rnk_n; t++)
+      howmany *= n[t];
+
+and manual computation of the desired serial transforms along the last
+``rnk_n-rnk_pm-1`` dimensions.
diff --git a/doc/intro.rst b/doc/intro.rst
new file mode 100644
index 0000000..9f03204
--- /dev/null
+++ b/doc/intro.rst
@@ -0,0 +1,158 @@
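+.. Residue of LaTeX macro definitions left by the conversion; kept as a comment so it is not rendered:
+   [2]ifpackageloaded#1#2 [2]ifpackageloaded#1#2 [3]ifpackageloaded#1#2#3
+   #1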
+
+Introduction
+============
+
+A popular software library for computing FFTs is FFTW. This library
+also includes a parallel FFT implementation (FFTW-MPI) based on the
+Message Passing Interface (MPI). FFTW-MPI parallelizes multi-dimensional
+FFTs by a mixture of serial lower-dimensional FFTs and parallel data
+transpositions. However, FFTW-MPI makes use of a one-dimensional data
+decomposition, which shows to be a scalability bottleneck on large
+scale, parallel computers. For example, a three-dimensional FFT of size
+:math:`1024^3` can be computed with at most :math:`1024` MPI processes.
+In contrast, using a two-dimensional data decomposition would increase
+the maximum number of MPI processes to :math:`1024^2` in this case.
+
+The main goal of PFFT is to extend the MPI part of the FFTW software
+library to multi-dimensional data decompositions, i.e.,
+:math:`d`-dimensional FFTs of size :math:`N^d` can be computed in
+parallel with at most :math:`N^{d-1}` MPI processes. In addition, PFFT
+offers several extra features that are particularly useful for parallel,
+distributed memory FFTs but are not yet present in FFTW-MPI. We refer to
+the publication  for a closer look on the different data decompositions
+and the underlying algorithms of the PFFT library.
+
+The interface of PFFT is as close as possible to the FFTW-MPI interface.
+In fact, we consider every difference between PFFT and FFTW that is not
+explicitly mentioned within this manual as a bug that should be reported
+to https://github.com/mpip/pfft.git. Therefore, porting code that uses
+FFTW-MPI to PFFT is almost trivial, e.g. see Section [sec:porting].
+
+Most features of PFFT are inherited from FFTW or similarly implemented.
+These include the following:
+
+We employ fast :math:`\mathcal{O}(N\log N)` algorithms of FFTW to
+compute arbitrary-size discrete Fourier transforms of complex data, real
+data, and even- or odd-symmetric real data.
+
+The dimension of the FFT can be arbitrary. However, parallel data
+decomposition must be at least one dimension smaller.
+
+PFFT offers portable performance; e.g., it will perform well on most
+platforms.
+
+The application of PFFT is split into a time consuming planning step and
+a high performance execution step.
+
+Installing the library is easy. It is based on the common sequence of
+configure, make, and make install.
+
+The interface of PFFT is very close to the MPI interface of FFTW. In
+fact, we tried to add as few extra parameters as possible.
+
+PFFT is written in C but also offers a Fortran interface, see
+Section [sec:fortran].
+
+FFTW includes shared memory parallelism for all serial transforms. This
+enables us to benefit from hybrid parallelism to a certain amount, see
+Section [sec:openmp].
+
+All steps of our parallel FFT can be performed completely in place. This
+is especially remarkable for the global transposition routines.
+
+Conforming to good MPI programming practice, all PFFT transforms can be
+performed on user defined communicators. In other words, PFFT does not
+enforce the user to work with ``MPI_COMM_WORLD``.
+
+PFFT uses the same algorithm to compute the size of the local array
+blocks as FFTW. This implies that the FFT size need not be divisible by
+the number of processes.
+
+PFFT supports single, double and long double precision.
+
+PFFT supports new-array execution, i.e., a PFFT plan can be planned and
+executed on different arrays up to some restrictions, see
+Section [sec:new-array] for details. Thanks to Yu Feng for the new-array
+execute patch.
+
+Furthermore, we added some special features to support repeated tasks
+that often occur in practical application of parallel FFTs.
+
+PFFT includes a very flexible ghost cell exchange module. A detailed
+description of this module is given in Section [sec:gc].
+
+PFFT accepts three-dimensional data decomposition even for
+three-dimensional FFTs. However, the underlying parallel FFT framework
+is still based on two-dimensional decomposition. A more detailed
+description can be found in Section [sec:3don2d].
+
+PFFT explicitly supports the parallel calculation of pruned FFTs.
+Details are given in Section [sec:pruned].
+
+Finally, we complete this overview with a list of features that are (not
+yet) implemented in PFFT.
+
+Parallel one-dimensional FFT based on MPI. FFTW-MPI uses another
+parallelization strategy for one-dimensional FFTs, which is not
+implemented in PFFT. The reason is that we can not achieve a scalability
+benefit due to higher dimensional data decomposition if the FFT has only
+one dimension. Therefore, one can also call FFTW directly in this case.
+
+There is no equivalent of FFTW *wisdom* in PFFT, i.e., you can not save
+a PFFT plan to disk and restore it for later use.
+
+PFFT does not have full OpenMP support. All serial FFT computations and
+global communications are implemented with FFTW, which offers OpenMP
+support, see Section [sec:openmp]. However, most of the PFFT-only
+features, such as pruned FFT, ghost cell send and 3d decomposition of 3d
+FFTs are not yet parallelized with OpenMP.
+
+PFFT does not have full SIMD support. All serial FFT computations and
+global communications are implemented with FFTW, which offers SIMD
+support, see Section [sec:simd]. However, most of the PFFT-only
+features, such as pruned FFT, ghost cell send and 3d decomposition of 3d
+FFTs are not yet parallelized with SIMD.
+
+PFFT does not overlap communication and computation. The code of PFFT is
+built in a very modularized structure. Most of these modules consist of
+FFTW routines. Therefore, the global transposition does not support
+non-blocking communication.
+
+Similar to FFTW, we do not provide any parallel IO routines. The user is
+responsible for loading and storing parallel data.
+
+PFFT depends on FFTW to perform its serial transforms and does not
+support different vendor FFTs (such as Intel’s MKL or IBM’s ESSL).
+However, this is not assumed to be a big drawback, since FFTW seems to
+perform very well on most platforms.
+
+The global communication routines can not be called separately. However,
+it should be possible to implement a user interface to our global
+transposition routines.
+
+PFFT does not support GPU parallelization.
+
+You are welcome to propose new PFFT features at
+https://github.com/mpip/pfft.git.
+
+Alternative parallel FFT implementations
+----------------------------------------
+
+There have been several FFT implementations that aim to circumvent the
+scalability bottleneck for at least three dimensional FFTs by using a
+two-dimensional decomposition approach. However, these implementations
+are often fitted to special problems and were not published as a
+stand-alone software library. Remarkable exceptions are the parallel FFT
+software library by S. Plimpton, the P3DFFT software library by
+D. Pekurovsky and the software library by N. Li.
+
+Parallel nonequispaced FFT
+--------------------------
+
+If you are interested in a parallel implementation of nonequispaced
+fast Fourier transforms (NFFT) for distributed memory architectures, you
+should have a look at our PNFFT software library  that is also available
+at https://github.com/mpip/pnfft.git.
diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 0000000..d41f63f
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,263 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation.
+REM SPHINXBUILD, SPHINXOPTS and PAPER are read from the environment; the target name is the first argument.
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+REM Without a target on the command line, jump straight to the help text.
+if "%1" == "" goto help
+REM help: print the list of targets, also reached by goto when no target is given. NOTE for review: latexpdf and latexpdfja are handled further down but are not listed here.
+if "%1" == "help" (
+	:help
+	echo.Please use `make ^<target^>` where ^<target^> is one of
+	echo.  html       to make standalone HTML files
+	echo.  dirhtml    to make HTML files named index.html in directories
+	echo.  singlehtml to make a single large HTML file
+	echo.  pickle     to make pickle files
+	echo.  json       to make JSON files
+	echo.  htmlhelp   to make HTML files and a HTML help project
+	echo.  qthelp     to make HTML files and a qthelp project
+	echo.  devhelp    to make HTML files and a Devhelp project
+	echo.  epub       to make an epub
+	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+	echo.  text       to make text files
+	echo.  man        to make manual pages
+	echo.  texinfo    to make Texinfo files
+	echo.  gettext    to make PO message catalogs
+	echo.  changes    to make an overview over all changed/added/deprecated items
+	echo.  xml        to make Docutils-native XML files
+	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
+	echo.  linkcheck  to check all external links for integrity
+	echo.  doctest    to run all doctests embedded in the documentation if enabled
+	echo.  coverage   to run coverage check of the documentation if enabled
+	goto end
+)
+REM clean: remove every subdirectory of BUILDDIR, then delete the files below it; needs no sphinx-build.
+if "%1" == "clean" (
+	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+	del /q /s %BUILDDIR%\*
+	goto end
+)
+
+
+REM Probe SPHINXBUILD by running it with stderr discarded; errorlevel 9009 is treated as command not found and triggers the Python fallback.
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 goto sphinx_python
+goto sphinx_ok
+
+:sphinx_python
+REM Fallback: run Sphinx through the python interpreter and probe again; give up with exit code 1 if that fails too.
+set SPHINXBUILD=python -m sphinx.__init__
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+REM Reached when the last probe did not report errorlevel 9009; the builder targets follow.
+:sphinx_ok
+
+REM html: standalone HTML files. Like most targets below: run the builder, exit with code 1 on failure, report the output location, jump to end.
+if "%1" == "html" (
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+	goto end
+)
+REM dirhtml: HTML files named index.html in directories.
+if "%1" == "dirhtml" (
+	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+	goto end
+)
+REM singlehtml: a single large HTML file.
+if "%1" == "singlehtml" (
+	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+	goto end
+)
+REM pickle: pickle files for further processing.
+if "%1" == "pickle" (
+	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can process the pickle files.
+	goto end
+)
+REM json: JSON files for further processing.
+if "%1" == "json" (
+	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can process the JSON files.
+	goto end
+)
+REM htmlhelp: HTML files plus a HTML help project; the trailing caret joins the message with the next line.
+if "%1" == "htmlhelp" (
+	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+	goto end
+)
+REM qthelp: HTML files plus a qthelp project; the hints name the .qhcp project file and the .qhc collection file generated from it.
+if "%1" == "qthelp" (
+	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\PFFT.qhcp
+	echo.To view the help file:
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\PFFT.qhc
+	goto end
+)
+REM devhelp: HTML files plus a Devhelp project.
+if "%1" == "devhelp" (
+	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished.
+	goto end
+)
+REM epub: an epub file.
+if "%1" == "epub" (
+	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The epub file is in %BUILDDIR%/epub.
+	goto end
+)
+REM latex: LaTeX files only; the paper size comes from PAPER via ALLSPHINXOPTS.
+if "%1" == "latex" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+	goto end
+)
+REM latexpdf: LaTeX files, then make all-pdf inside BUILDDIR/latex, then back to the script directory. NOTE for review: unlike the other targets, no errorlevel check follows sphinx-build or make here.
+if "%1" == "latexpdf" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	cd %BUILDDIR%/latex
+	make all-pdf
+	cd %~dp0
+	echo.
+	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+	goto end
+)
+REM latexpdfja: same steps as latexpdf but runs make all-pdf-ja. NOTE for review: no errorlevel checks here either.
+if "%1" == "latexpdfja" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	cd %BUILDDIR%/latex
+	make all-pdf-ja
+	cd %~dp0
+	echo.
+	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+	goto end
+)
+REM text: plain text files.
+if "%1" == "text" (
+	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The text files are in %BUILDDIR%/text.
+	goto end
+)
+REM man: manual pages.
+if "%1" == "man" (
+	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The manual pages are in %BUILDDIR%/man.
+	goto end
+)
+REM texinfo: Texinfo files.
+if "%1" == "texinfo" (
+	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+	goto end
+)
+REM gettext: PO message catalogs; the only target using I18NSPHINXOPTS, which carries no -d doctrees option.
+if "%1" == "gettext" (
+	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+	goto end
+)
+REM changes: overview of all changed, added and deprecated items.
+if "%1" == "changes" (
+	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.The overview file is in %BUILDDIR%/changes.
+	goto end
+)
+REM linkcheck: check all external links; results are also written to output.txt.
+if "%1" == "linkcheck" (
+	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+	goto end
+)
+REM doctest: run the doctests embedded in the documentation; results go to output.txt.
+if "%1" == "doctest" (
+	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+	goto end
+)
+REM coverage: run the documentation coverage check; results go to python.txt.
+if "%1" == "coverage" (
+	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Testing of coverage in the sources finished, look at the ^
+results in %BUILDDIR%/coverage/python.txt.
+	goto end
+)
+REM xml: Docutils-native XML files.
+if "%1" == "xml" (
+	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The XML files are in %BUILDDIR%/xml.
+	goto end
+)
+REM pseudoxml: pseudo-XML files for display purposes.
+if "%1" == "pseudoxml" (
+	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
+	goto end
+)
+REM Common exit point of all targets; an unrecognised target name falls through to here without any output.
+:end
diff --git a/doc/preample.tex b/doc/preample.tex
new file mode 100644
index 0000000..a5d548b
--- /dev/null
+++ b/doc/preample.tex
@@ -0,0 +1,105 @@
+
+\usepackage[english]{babel} % wordbreaks
+\usepackage[utf8]{inputenc}
+\usepackage[T1]{fontenc}
+\usepackage[fixlanguage]{babelbib} % easily change bib language
+
+%% Vector fonts for PDF
+%% \usepackage{ae} % deprecated package, use lmodern instead
+\usepackage{lmodern} %standard latex font
+
+%\usepackage{makeidx} % automatic index generation, required for nomencl.sty
+%\usepackage{nomencl} % important symbols in a table at the beginning of document
+
+%% AMSMath-packages
+\usepackage{amsmath}
+\usepackage{amsthm}
+\usepackage{amssymb}
+% \usepackage{amsrefs}
+% \usepackage{textcmds}
+\usepackage{exscale} % Correct font scaling in formulas
+
+% \usepackage{subfig}
+% \usepackage{graphicx} % Graphics for figures
+\usepackage[svgnames, table, hyperref]{xcolor} %
+\usepackage{paralist} % compact itemize, enumerate, ...
+\usepackage{listings} % source code in LaTeX
+\usepackage{multirow} % combine multiple rows in arrays
+\usepackage{rotating}
+
+%% improvements of LaTeX environments
+\usepackage{scrhack} % avoid warning of scrreprt when loading float package
+% \usepackage{float}
+% \usepackage{verbatim}
+% \usepackage{array}
+
+% \usepackage{url} % provides \url command for bibtex
+\usepackage{hyperref} % provides \url command for bibtex and links to jump within documents
+\hypersetup{plainpages=false, colorlinks, linkcolor=black, citecolor=black, urlcolor=blue,
+pdftitle={PFFT User Manual},
+pdfauthor={Michael Pippig}, pdfstartview={FitBH}}
+%% the setup above colors links instead of framing them: black for internal links and citations, blue for URLs
+%% adjust numbering: figures, tables and equations are numbered within each chapter
+\numberwithin{figure}{chapter}
+\numberwithin{table}{chapter}
+\numberwithin{equation}{chapter}
+
+%% activate for algorithms
+\usepackage{algpseudocode}
+\usepackage[chapter]{algorithm}
+\usepackage{algorithmicx}
+% \floatname{algorithm}{Algorithmus} % use german title for algorithms
+% \numberwithin{algorithm}{chapter}
+
+\usepackage{xspace}
+\usepackage{nicefrac}
+
+\usepackage{todonotes}
+
+%% activate for compact page layout
+% \usepackage{geometry}
+% \geometry{top=30.4mm, left=30.4mm, text={155mm,240mm}, headheight=10mm, headsep=5mm, includemp, marginparwidth=15.4mm}
+
+%% activate for headline with chapter information on every page (KOMA-Script naming: l/r = left/right, e/o = even/odd page)
+\usepackage{scrpage2} % NOTE(review): scrpage2 is obsolete in current KOMA-Script; scrlayer-scrpage is its successor
+\pagestyle{scrheadings}
+\automark{chapter}
+\clearscrheadings % start from empty headers and footers
+\lehead{\pagemark} % even pages, left head: page number
+\rehead{\leftmark} % even pages, right head: left mark
+\rohead{\pagemark} % odd pages, right head: page number
+\lohead{\rightmark} % odd pages, left head: right mark
+\ofoot[]{} % outer, centre and inner footers are left empty
+\cfoot[]{}
+\ifoot[]{}
+
+%% footnotes are marked with symbols instead of numbers
+\renewcommand*{\thefootnote}{\fnsymbol{footnote}}
+
+%% the quotchap document style redefines the \chapter and \chapter* commands to
+%% create fancy chapter head pages with huge chapter numbers (possibly greyed) and
+%% provides commands for adding quotations in the upper left corner of these pages.
+% \usepackage[grey]{quotchap}
+
+
+%% very special purpose packages
+% \usepackage{faktor} % provides a symbol for factor groups
+% \usepackage{slashbox} % diagonally divide an array field
+
+
+%% experimental
+% \usepackage[color]{showkeys} % show all reference keys
+% \definecolor{refkey}{gray}{.75}
+% \definecolor{labelkey}{gray}{0.75}
+% \usepackage{epstopdf} % include .eps files with pdflatex
+% \usepackage{marginnote}
+%\usepackage{pgf}
+%\usepackage{jkpgf}
+%\usepackage{pstricks}
+%% make pdf-indexfile for inverse search - needs compatible pdf viewer
+% \synctex=1
+% \usepackage{pdfsync} % deprecated package, use synctex instead
+
+\hyphenation{equi-spaced non-equi-spaced}
+
+\input{shortcuts.tex} % Shortcuts for math symbols
diff --git a/doc/reference.rst b/doc/reference.rst
new file mode 100644
index 0000000..08a6e39
--- /dev/null
+++ b/doc/reference.rst
@@ -0,0 +1,1641 @@
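+.. Residue of LaTeX macro definitions left by the conversion; kept as a comment so it is not rendered:
+   [2]ifpackageloaded#1#2 [2]ifpackageloaded#1#2 [3]ifpackageloaded#1#2#3
+   #1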
+
+PFFT Reference
+==============
+
+Files and Data Types
+--------------------
+
+You must include the PFFT header file by
+
+::
+
+    #include <pfft.h>
+
+in the preamble of each source file that calls PFFT. This header
+automatically includes ``fftw3.h`` and ``fftw3-mpi.h``. Therefore, PFFT
+can use the ``fftw_complex`` data type defined in ``fftw3.h`` (see the FFTW manual). Note
+that ``fftw_complex`` is defined to be the C99 native complex whenever
+``<complex.h>`` is included *before* ``<fftw3.h>``, ``<fftw3-mpi.h>`` and
+``<pfft.h>``. Otherwise it is defined as
+
+::
+
+    typedef double fftw_complex[2];
+
+For the sake of a clean namespace we define the wrapper data type
+``pfft_complex`` as
+
+::
+
+    typedef fftw_complex pfft_complex;
+
+that can be used equivalently to ``fftw_complex``. Furthermore, we
+define the wrapper functions
+
+::
+
+    void *pfft_malloc(size_t n);
+    double *pfft_alloc_real(size_t n);
+    pfft_complex *pfft_alloc_complex(size_t n);
+    void pfft_free(void *p);
+
+as substitutes for their corresponding FFTW equivalents (see the FFTW manual). Note that
+memory allocated by one of these functions must be freed with
+``pfft_free`` (or its equivalent ``fftw_free``). Because of the
+performance reasons given in the FFTW manual, we recommend to use one of the ``pfft_``
+(or its equivalent ``fftw_``) allocation functions for all arrays
+containing FFT inputs and outputs. However, PFFT will also work
+(possibly slower) with any other memory allocation method.
+
+Different precisions are handled as in FFTW: That is ``pfft_`` functions
+and datatypes become ``pfftf_`` (single precision) or ``pfftl_`` (long
+double precision) prefixed. Quadruple precision is not yet supported.
+The main problem is that we do not know about a suitable MPI datatype to
+represent ``__float128``.
+
+MPI Initialization
+------------------
+
+Initialization and cleanup of PFFT is done in the same way as for
+FFTW-MPI (see the FFTW manual). In order to keep a clean name space, PFFT offers the
+wrapper functions
+
+::
+
+    void pfft_init(void);
+    void pfft_cleanup(void);
+
+that can be used as substitutes for ``fftw_mpi_init`` and
+``fftw_mpi_cleanup``, respectively.
+
+Using PFFT Plans
+----------------
+
+PFFT follows exactly the same workflow as FFTW-MPI. A plan created by
+one of the functions given in Section [sec:create-plan] is executed with
+
+::
+
+    void pfft_execute(const pfft_plan plan);
+
+and freed with
+
+::
+
+    void pfft_destroy_plan(const pfft_plan plan);
+
+Note, that you can *not* apply ``fftw_mpi_execute`` or ``fftw_destroy_plan``
+on PFFT plans.
+
+The new array execute functions are given by
+
+::
+
+    void pfft_execute_dft(const pfft_plan plan, pfft_complex *in, pfft_complex *out);
+    void pfft_execute_dft_r2c(const pfft_plan plan, double *in, pfft_complex *out);
+    void pfft_execute_dft_c2r(const pfft_plan plan, pfft_complex *in, double *out);
+    void pfft_execute_r2r(const pfft_plan plan, double *in, double *out);
+
+The arrays given by ``in`` and ``out`` must have the correct size and
+the same alignment as the arrays that were used to create the plan, just
+as it is the case for FFTW, see [fftw-new-array].
+
+Data Distribution Functions
+---------------------------
+
+Complex-to-Complex FFT
+~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+    ptrdiff_t pfft_local_size_dft_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_dft(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_many_dft(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Compute the data distribution of a parallel, complex input/output
+discrete Fourier transform (DFT) in two or more dimensions, returning
+the number of *complex* numbers that must be allocated to hold the
+parallel transform.
+
+Arguments:
+
+``rnk_n`` is the rank of the transform (typically the size of the arrays
+``n``, ``ni``, ``no``) that can be any integer :math:`\ge 2`. The
+``_3d`` planner corresponds to a ``rnk_n`` of 3.
+
+The array ``n`` of size ``rnk_n`` specifies the transform dimensions.
+They can be any positive integer.
+
+The array ``ni`` of size ``rnk_n`` specifies the input array dimensions.
+They can be any positive integer with ``ni[t] <= n[t]`` for all
+dimensions ``t=0,...,rnk_n-1``. For ``ni[t]<n[t]`` the inputs will be
+padded with zeros up to size ``n[t]`` along the ``t``-th dimension
+before the transform, see Section [sec:pruned-fft].
+
+The array ``no`` of size ``rnk_n`` specifies the output array
+dimensions. They can be any positive integer with ``no[t] <= n[t]`` for
+all dimensions ``t=0,...,rnk_n-1``. For ``no[t]<n[t]`` the outputs will
+be pruned to size ``no[t]`` along the ``t``-th dimension after the
+transform, see Section [sec:pruned-fft].
+
+``howmany`` is the number of transforms to compute. The resulting plan
+computes howmany transforms, where the input of the k-th transform is at
+location in+k (in C pointer arithmetic) with stride ``howmany``, and its
+output is at location out+k with stride ``howmany``. The basic
+``pfft_plan_dft`` interface corresponds to howmany=1.
+
+``comm_cart`` is a Cartesian communicator of dimension ``rnk_pm`` that
+specifies the parallel data decomposition, see
+Section [sec:data-decomp]. Most of the time, PFFT requires
+``rnk_pm < rnk_n``. The only exception is the case
+``rnk_pm == rnk_n == 3``, see Section [sec:3don3d]. If an ordinary (i.e.
+non-Cartesian) communicator is passed, PFFT internally converts it into
+a one-dimensional Cartesian communicator while retaining the MPI ranks
+(this results in the FFTW-MPI data decomposition).
+
+The arrays ``iblock`` and ``oblock`` of size ``rnk_pm+1`` specify the
+block sizes for the first ``rnk_pm+1`` dimensions of the input and
+output data, respectively. These must be the same block sizes as were
+passed to the corresponding ``local_size`` function. You can pass
+``PFFT_DEFAULT_BLOCKS`` to use PFFT’s default block sizes. Furthermore,
+you can use ``PFFT_DEFAULT_BLOCK`` to set the default block size in
+separate dimensions, e.g., ``iblock[t]=PFFT_DEFAULT_BLOCK``.
+
+``pfft_flags`` is a bitwise OR (’``|``\ ’) of zero or more planner
+flags, as defined in Section [sec:flags].
+
+The array ``local_ni`` of size ``rnk_n`` returns the size of the local
+input array block in every dimension (counted in units of complex
+numbers).
+
+The array ``local_i_start`` of size ``rnk_n`` returns the offset of the
+local input array block in every dimension (counted in units of complex
+numbers).
+
+The array ``local_no`` of size ``rnk_n`` returns the size of the local
+output array block in every dimension (counted in units of complex
+numbers).
+
+The array ``local_o_start`` of size ``rnk_n`` returns the offset of the
+local output array block in every dimension (counted in units of complex
+numbers).
+
+In addition, the following ``local_block`` functions compute the local
+data distribution of the process with MPI rank ``pid``. The
+``local_size`` interface can be understood as a call of ``local_block``
+where ``pid`` is given by ``MPI_Comm_rank(comm_cart, &pid)``, i.e., each
+MPI process computes its own data block. However, ``local_block``
+functions have a ``void`` return type, i.e., they omit the computation
+of the local array size that is necessary to hold the parallel
+transform. This makes ``local_block`` functions substantially faster in
+execution.
+
+::
+
+    void pfft_local_block_dft_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_dft(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_many_dft(
+        int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Real-to-Complex FFT
+~~~~~~~~~~~~~~~~~~~
+
+::
+
+    ptrdiff_t pfft_local_size_dft_r2c_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_dft_r2c(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_many_dft_r2c(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Compute the data distribution of a parallel, real-input/complex-output
+discrete Fourier transform (DFT) in two or more dimensions, returning
+the number of *complex* numbers that must be allocated to hold the
+parallel transform.
+
+Arguments are the same as for c2c transforms (see
+Section [sec:local-size-c2c]) with the following exceptions:
+
+The logical input array size ``ni`` will differ from the physical array
+size of the real inputs if the flag ``PFFT_PADDED_R2C`` is included in
+``pfft_flags``. This results from the padding at the end of the last
+dimension that is necessary to align the real valued inputs and complex
+valued outputs for in-place transforms. In contrast to FFTW-MPI,
+PFFT does not pad the r2c inputs by default.
+
+``local_ni`` is counted in units of real numbers. It will include
+padding if the flag ``PFFT_PADDED_R2C`` is included in ``pfft_flags``.
+
+``local_i_start`` is counted in units of real numbers.
+
+The corresponding ``local_block`` functions compute the local data
+distribution of the process with MPI rank ``pid``.
+
+::
+
+    void pfft_local_block_dft_r2c_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_dft_r2c(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_many_dft_r2c(
+        int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Complex-to-Real FFT
+~~~~~~~~~~~~~~~~~~~
+
+::
+
+    ptrdiff_t pfft_local_size_dft_c2r_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_dft_c2r(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_many_dft_c2r(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Compute the data distribution of a parallel, complex-input/real-output
+discrete Fourier transform (DFT) in two or more dimensions, returning
+the number of *complex* numbers that must be allocated to hold the
+parallel transform.
+
+Arguments are the same as for c2c transforms (see
+Section [sec:local-size-c2c]) with the following exceptions:
+
+The logical output array size ``no`` will differ from the physical array
+size of the real outputs if the flag ``PFFT_PADDED_C2R`` is included in
+``pfft_flags``. This results from the padding at the end of the last
+dimension that is necessary to align the real valued outputs and complex
+valued inputs for in-place transforms. In contrast to FFTW-MPI,
+PFFT does not pad the c2r outputs by default.
+
+``local_no`` is counted in units of real numbers.
+
+``local_o_start`` is counted in units of real numbers.
+
+The corresponding ``local_block`` functions compute the local data
+distribution of the process with MPI rank ``pid``.
+
+::
+
+    void pfft_local_block_dft_c2r_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_dft_c2r(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_many_dft_c2r(
+        int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Real-to-Real FFT
+~~~~~~~~~~~~~~~~
+
+::
+
+    ptrdiff_t pfft_local_size_r2r_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_r2r(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_many_r2r(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Compute the data distribution of a parallel, real input/output (r2r)
+transform in two or more dimensions, returning the number of *real*
+numbers that must be allocated to hold the parallel transform.
+
+Arguments are the same as for c2c transforms (see
+Section [sec:local-size-c2c]) with the following exceptions:
+
+``local_ni`` is counted in units of real numbers.
+
+``local_i_start`` is counted in units of real numbers.
+
+``local_no`` is counted in units of real numbers.
+
+``local_o_start`` is counted in units of real numbers.
+
+The corresponding ``local_block`` functions compute the local data
+distribution of the process with MPI rank ``pid``.
+
+::
+
+    void pfft_local_block_r2r_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_r2r(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_many_r2r(
+        int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Plan Creation
+-------------
+
+Complex-to-Complex FFT
+~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+    pfft_plan pfft_plan_dft_3d(
+        const ptrdiff_t *n, pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_dft(
+        int rnk_n, const ptrdiff_t *n, pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_dft(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_dft_skipped(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        const int *skip_trafos, pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Plan a parallel, complex input/output discrete Fourier transform (DFT)
+in two or more dimensions, returning a ``pfft_plan``. The planner
+returns NULL if the plan cannot be created.
+
+Arguments:
+
+``rnk_n``, ``n``, ``ni``, ``no``, ``howmany``, ``iblock``, ``oblock``,
+``comm_cart`` must be the same as passed to the corresponding
+``pfft_local_size_dft`` function, see Section [sec:local-size-c2c].
+
+The array ``skip_trafos`` of size ``rnk_pm+1`` specifies the serial
+transforms that will be omitted. For ``t=0,...,rnk_pm`` set
+``skip_trafos[t]=1`` if the ``t``-th serial transformation should be
+omitted, otherwise set ``skip_trafos[t]=0``, see
+Section [sec:skip-trafo] for more details.
+
+``in`` and ``out`` point to the complex valued input and output arrays
+of the transform, which may be the same (yielding an in-place
+transform). These arrays are overwritten during planning, unless
+``PFFT_ESTIMATE | PFFT_NO_TUNE`` is used in the flags. (The arrays need
+not be initialized, but they must be allocated.)
+
+``sign`` is the sign of the exponent in the formula that defines the
+Fourier transform. It can be -1 (= ``PFFT_FORWARD``) or +1 (=
+``PFFT_BACKWARD``).
+
+``pfft_flags`` is a bitwise OR (’``|``\ ’) of zero or more planner
+flags, as defined in Section [sec:flags].
+
+PFFT computes an unnormalized transform: computing a forward followed by
+a backward transform (or vice versa) will result in the original data
+multiplied by the size of the transform (the product of the dimensions
+``n[t]``).
+
+Real-to-Complex FFT
+~~~~~~~~~~~~~~~~~~~
+
+::
+
+    pfft_plan pfft_plan_dft_r2c_3d(
+        const ptrdiff_t *n, double *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_dft_r2c(
+        int rnk_n, const ptrdiff_t *n, double *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_dft_r2c(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        double *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_dft_r2c_skipped(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        const int *skip_trafos, double *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Plan a parallel, real-input/complex-output discrete Fourier transform
+(DFT) in two or more dimensions, returning a ``pfft_plan``. The planner
+returns NULL if the plan cannot be created.
+
+Arguments:
+
+``rnk_n``, ``n``, ``ni``, ``no``, ``howmany``, ``iblock``, ``oblock``,
+``comm_cart`` must be the same as passed to the corresponding
+``pfft_local_size_dft_r2c`` function, see Section [sec:local-size-r2c].
+
+``in`` and ``out`` point to the real valued input and complex valued
+output arrays of the transform, which may be the same (yielding an
+in-place transform). These arrays are overwritten during planning,
+unless ``PFFT_ESTIMATE | PFFT_NO_TUNE`` is used in the flags. (The
+arrays need not be initialized, but they must be allocated.)
+
+``sign`` is the sign of the exponent in the formula that defines the
+Fourier transform. It can be -1 (= ``PFFT_FORWARD``) or +1 (=
+``PFFT_BACKWARD``). Note that this parameter is not part of the FFTW-MPI
+interface, where r2c transforms are defined to be forward transforms.
+However, the backward transform can be easily realized by an additional
+conjugation of the complex outputs as done by PFFT.
+
+Complex-to-Real FFT
+~~~~~~~~~~~~~~~~~~~
+
+::
+
+    pfft_plan pfft_plan_dft_c2r_3d(
+        const ptrdiff_t *n, pfft_complex *in, double *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_dft_c2r(
+        int rnk_n, const ptrdiff_t *n, pfft_complex *in, double *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_dft_c2r(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        pfft_complex *in, double *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_dft_c2r_skipped(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        const int *skip_trafos, pfft_complex *in, double *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Plan a parallel, complex-input/real-output discrete Fourier transform
+(DFT) in two or more dimensions, returning a ``pfft_plan``. The planner
+returns NULL if the plan cannot be created.
+
+Arguments:
+
+``rnk_n``, ``n``, ``ni``, ``no``, ``howmany``, ``iblock``, ``oblock``,
+``comm_cart`` must be the same as passed to the corresponding
+``pfft_local_size_dft_c2r`` function, see Section [sec:local-size-c2r].
+
+``in`` and ``out`` point to the complex valued input and real valued
+output arrays of the transform, which may be the same (yielding an
+in-place transform). These arrays are overwritten during planning,
+unless ``PFFT_ESTIMATE | PFFT_NO_TUNE`` is used in the flags. (The
+arrays need not be initialized, but they must be allocated.)
+
+``sign`` is the sign of the exponent in the formula that defines the
+Fourier transform. It can be -1 (= ``PFFT_FORWARD``) or +1 (=
+``PFFT_BACKWARD``). Note that this parameter is not part of the FFTW-MPI
+interface, where c2r transforms are defined to be backward transforms.
+However, the forward transform can be easily realized by an additional
+conjugation of the complex inputs as done by PFFT.
+
+Real-to-Real FFT
+~~~~~~~~~~~~~~~~
+
+::
+
+    pfft_plan pfft_plan_r2r_3d(
+        const ptrdiff_t *n, double *in, double *out, MPI_Comm comm_cart,
+        const pfft_r2r_kind *kinds, unsigned pfft_flags);
+    pfft_plan pfft_plan_r2r(
+        int rnk_n, const ptrdiff_t *n, double *in, double *out, MPI_Comm comm_cart,
+        const pfft_r2r_kind *kinds, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_r2r(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        double *in, double *out, MPI_Comm comm_cart,
+        const pfft_r2r_kind *kinds, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_r2r_skipped(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        const int *skip_trafos, double *in, double *out, MPI_Comm comm_cart,
+        const pfft_r2r_kind *kinds, unsigned pfft_flags);
+
+Plan a parallel, real input/output (r2r) transform in two or more
+dimensions, returning a ``pfft_plan``. The planner returns NULL if the
+plan cannot be created.
+
+Arguments:
+
+``rnk_n``, ``n``, ``ni``, ``no``, ``howmany``, ``iblock``, ``oblock``,
+``comm_cart`` must be the same as passed to the corresponding
+``pfft_local_size_r2r`` function, see Section [sec:local-size-r2r].
+
+``in`` and ``out`` point to the real valued input and output arrays of
+the transform, which may be the same (yielding an in-place transform).
+These arrays are overwritten during planning, unless
+``PFFT_ESTIMATE | PFFT_NO_TUNE`` is used in the flags. (The arrays need
+not be initialized, but they must be allocated.)
+
+The array ``kinds`` of length ``rnk_n`` specifies the kind of r2r
+transform that is computed in the corresponding dimensions. Just like
+FFTW-MPI we compute the separable product formed by taking each
+transform kind along the corresponding dimension, one dimension after
+another.
+
+FFT Execution Timer
+-------------------
+
+PFFT offers an easy way to perform run time measurements and print/write
+the results.
+
+Basic Run Time Measurements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+PFFT-plans automatically accumulate the local run times of every call to
+``pfft_execute``. For most applications it is sufficient to print run
+time of a plan ``ths`` averaged over all runs with
+
+::
+
+    void pfft_print_average_timer(
+        const pfft_plan ths, MPI_Comm comm);
+
+Note that for each timer the maximum time over all processes is reduced
+to rank ``0`` of communicator ``comm``, i.e., a call to ``MPI_Reduce``
+is performed and the output is only printed on this process. The
+following function works in the same way but prints more verbose output
+
+::
+
+    void pfft_print_average_timer_adv(
+        const pfft_plan ths, MPI_Comm comm);
+
+To write the averaged run time of plan ``ths`` into a file called
+``name`` use
+
+::
+
+    void pfft_write_average_timer(
+        const pfft_plan ths, const char *name, MPI_Comm comm);
+    void pfft_write_average_timer_adv(
+        const pfft_plan ths, const char *name, MPI_Comm comm);
+
+Again, the output is only written on rank ``0`` of communicator
+``comm``.
+
+Discard all the recorded run times with
+
+::
+
+    void pfft_reset_timer(
+        pfft_plan ths);
+
+This function is called per default at the end of every PFFT plan
+creation function.
+
+Advanced Timer Manipulation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In order to access the run times directly a new typedef ``pfft_timer``
+is introduced. The following function returns a copy of the timer
+corresponding to PFFT plan ``ths``
+
+::
+
+    pfft_timer pfft_get_timer(
+        const pfft_plan ths);
+
+Note that the memory of the returned ``pfft_timer`` must be released
+with
+
+::
+
+    void pfft_destroy_timer(
+        pfft_timer ths);
+
+as soon as the timer is not needed anymore.
+
+In the following we introduce some routines to perform basic operations
+on timers. For all functions with a ``pfft_timer`` return value you must
+use ``pfft_destroy_timer`` in order to release the allocated memory of
+the timer. Create a copy of a PFFT-timer ``orig`` with
+
+::
+
+    pfft_timer pfft_copy_timer(
+        const pfft_timer orig);
+
+Compute the average, local time over all runs of a timer ``ths`` with
+
+::
+
+    void pfft_average_timer(
+        pfft_timer ths);
+
+Create a new timer that contains the sum of two timers ``sum1`` and
+``sum2`` with
+
+::
+
+    pfft_timer pfft_add_timers(
+        const pfft_timer sum1, const pfft_timer sum2);
+
+Create a timer that contains the maximum times of all the timers ``ths``
+from all processes belonging to communicator ``comm`` with
+
+::
+
+    pfft_timer pfft_reduce_max_timer(
+        const pfft_timer ths, MPI_Comm comm);
+
+Since this function calls ``MPI_Reduce``, only the first process (rank
+0) of ``comm`` will get the desired data while all the other processes
+have timers with undefined values.
+
+Note that you cannot access the elements of a timer directly, since it
+is only a pointer to a ``struct``. However, PFFT offers a routine that
+creates an array and copies all the entries of the timer into it
+
+::
+
+    double* pfft_convert_timer2vec(
+        const pfft_timer ths);
+
+Remember to use ``free`` in order to release the allocated memory of the
+returned array at the moment it is not needed anymore. The entries of
+the returned array are ordered as follows:
+
+dimension of the process mesh ``rnk_pm``
+
+number of serial trafos ``rnk_trafo``
+
+number of global remaps ``rnk_remap``
+
+number of ``pfft_execute`` runs ``iter``
+
+local run time of all runs
+
+``rnk_n`` local times of the serial trafos
+
+``rnk_remap`` local times of the global remaps
+
+2 times of the global remaps that are only necessary for
+three-dimensional FFTs on three-dimensional process meshes
+
+time for computing twiddled input (as needed for ``PFFT_SHIFTED_OUT``)
+
+time for computing twiddled output (as needed for ``PFFT_SHIFTED_IN``)
+
+The complementary function
+
+::
+
+    pfft_timer pfft_convert_vec2timer(
+        const double *times);
+
+creates a timer and fills its entries with the data from array
+``times``. Thereby, the entries of ``times`` must be in the same order
+as above.
+
+Ghost Cell Communication
+------------------------
+
+In the following we describe the PFFT ghost cell communication module.
+At the moment, PFFT ghost cell communication is restricted to
+three-dimensional arrays.
+
+Assume a three-dimensional array ``data`` of size ``n[3]`` that is
+distributed in blocks such that each process has a local copy of
+``data[k[0],k[1],k[2]]`` with
+
+::
+
+    local_start[t] <= k[t] < local_start[t] + local_n[t]
+
+Here and in the following, we assume ``t=0,1,2``. The “classical” ghost
+cell exchange communicates all the necessary data between neighboring
+processes, such that each process gets a local copy of
+``data[k[0],k[1],k[2]]`` with
+
+::
+
+    local_gc_start[t] <= k[t] < local_gc_start[t] + local_ngc[t]
+
+where
+
+::
+
+    local_gc_start[t] = local_start[t] - gc_below[t];
+    local_ngc[t] = local_n[t] + gc_below[t] + gc_above[t];
+
+I.e., the local array block is increased in every dimension by
+``gc_below`` elements below and ``gc_above`` elements above. Hereby, the
+``data`` is wrapped periodically whenever ``k[t]`` exceeds the array
+dimensions. The number of ghost cells in every dimension can be chosen
+independently and can be arbitrarily large, i.e., PFFT ghost cell
+communication also handles the case where the requested data exceeds
+next neighbor communication. The number of ghost cells can even be
+bigger than the array size, which results in multiple local copies of
+the same data elements at every process. However, the arrays
+``gc_below`` and ``gc_above`` must be equal among all MPI processes.
+
+PFFT ghost cell communication can work on both the input and output
+array distributions. Substitute ``local_n`` and ``local_start`` by
+``local_ni`` and ``local_i_start`` if you are interested in ghost cell
+communication of the input array. For ghost cell communication of the
+output array, substitute ``local_n`` and ``local_start`` by ``local_no``
+and ``local_o_start``.
+
+Using Ghost Cell Plans
+~~~~~~~~~~~~~~~~~~~~~~
+
+We introduce a new datatype ``pfft_gcplan`` that stores all the
+necessary information for ghost cell communication. Using a ghost cell
+plan follows the typical workflow: At first, determine the parallel data
+distribution; cf. Section [sec:gc:local-size]. Next, create a ghost cell
+plan; cf. Section [sec:gc:plan-cdata] and Section [sec:gc:plan-rdata].
+Execute the ghost cell communication with one of the following two
+collective functions
+
+::
+
+    void pfft_exchange(
+        pfft_gcplan ths);
+    void pfft_reduce(
+        pfft_gcplan ths);
+
+Hereby, a ghost cell exchange creates duplicates of local data elements
+on next neighboring processes, while a ghost cell reduce is the adjoint
+counterpart of the exchange, i.e., it adds the sum of all the
+duplicates of a local data element to the original data element.
+Finally, free the allocated memory with
+
+::
+
+    void pfft_destroy_gcplan(
+        pfft_gcplan ths);
+
+if the plan is not needed anymore. Passing a freed plan to
+``pfft_exchange`` or ``pfft_reduce`` results in undefined behavior.
+
+Data Distribution
+~~~~~~~~~~~~~~~~~
+
+Corresponding to the three interface layers for FFT planning, there are
+the following three layers for computing the ghost cell data
+distribution:
+
+::
+
+    ptrdiff_t pfft_local_size_gc_3d(
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        ptrdiff_t *local_ngc, ptrdiff_t *local_gc_start);
+    ptrdiff_t pfft_local_size_gc(
+        int rnk_n, 
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        ptrdiff_t *local_ngc, ptrdiff_t *local_gc_start);
+    ptrdiff_t pfft_local_size_many_gc(
+        int rnk_n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        ptrdiff_t howmany,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        ptrdiff_t *local_ngc, ptrdiff_t *local_gc_start);
+
+Hereby, ``rnk_n`` and ``howmany`` must be exactly the same variables
+that were used for the PFFT plan creation. However, only the case
+``rnk_n==3`` is completely implemented at the moment. The local array
+size ``local_n`` must be equal to ``local_ni`` or ``local_no`` (computed
+by an appropriate call of ``pfft_local_size``; cf.
+Section [sec:local-size]) depending on whether the ghost cell plan works
+on the FFT input or output array. Analogously, ``local_start`` becomes
+``local_i_start`` or ``local_o_start``. The number of ghost cells is
+given by the two arrays ``gc_below`` and ``gc_above`` that must be equal
+among all MPI processes. All the ghost cell data distribution functions
+return the local array plus ghost cell size ``local_ngc`` and the
+corresponding offset ``local_gc_start`` as two arrays of length
+``rnk_n``. In addition, the ``ptrdiff_t`` return value gives the number
+of data elements that are necessary in order to store the array plus
+ghost cells.
+
+Note that the array distribution functions do not distinguish between
+real and complex valued data. That is because ``local_n`` and
+``local_start`` count array elements in units of complex or real
+depending on the transform. In addition, it does not matter if the local
+array is transposed or not, i.e., it is not necessary to pass the flags
+``PFFT_TRANSPOSED_IN`` and ``PFFT_TRANSPOSED_OUT`` to the ghost cell
+distribution function. In contrast, the ghost cell plan creation
+depends on the transform type as well as the transposition flags.
+
+Memory Allocation
+~~~~~~~~~~~~~~~~~
+
+In most applications we must ensure that the data array is large enough
+to suit the memory requirements of a parallel FFT and the ghost cell
+communication. The following two code snippets illustrate the correct
+allocation of memory for complex valued and real valued arrays.
+
+::
+
+    /* Get parameters of data distribution */
+    /* alloc_local, local_no, local_o_start are given in complex units */
+    /* local_ni, local_i_start are given in real units */
+    alloc_local = pfft_local_size_dft_r2c_3d(n, comm_cart_2d, PFFT_TRANSPOSED_NONE,
+        local_ni, local_i_start, local_no, local_o_start);
+
+    /* alloc_local_gc, local_ngc, local_gc_start are given in complex units */
+    alloc_local_gc = pfft_local_size_gc_3d(
+        local_no, local_o_start, gc_below, gc_above,
+        local_ngc, local_gc_start);
+
+    /* Allocate enough memory for FFT and ghost cells */
+    pfft_complex *cdata = pfft_alloc_complex(alloc_local_gc > alloc_local ? alloc_local_gc : alloc_local);
+
+Here, ``alloc_local`` gives the number of data elements that are
+necessary to hold all steps of the parallel FFT, while
+``alloc_local_gc`` gives the number of data elements that are necessary
+to hold all steps of the ghost cell communication. Note that we took the
+maximum of these two numbers as argument for ``pfft_alloc_complex``.
+The code snippet for real valued arrays looks very similar.
+
+::
+
+    /* Get parameters of data distribution */
+    /* alloc_local, local_no, local_o_start are given in complex units */
+    /* local_ni, local_i_start are given in real units */
+    alloc_local = pfft_local_size_dft_r2c_3d(n, comm_cart_2d, PFFT_TRANSPOSED_NONE,
+        local_ni, local_i_start, local_no, local_o_start);
+
+    /* alloc_local_gc, local_ngc, local_gc_start are given in real units */
+    alloc_local_gc = pfft_local_size_gc_3d(
+        local_ni, local_i_start, gc_below, gc_above,
+        local_ngc, local_gc_start);
+
+    /* Allocate enough memory for FFT and ghost cells */
+    double *rdata = pfft_alloc_real(alloc_local_gc > 2*alloc_local ? alloc_local_gc : 2*alloc_local);
+
+Note that the number of real valued data elements is given by two times
+``alloc_local`` for r2c transforms, whereas the last line would change
+into
+
+::
+
+    double *rdata = pfft_alloc_real(alloc_local_gc > alloc_local ? alloc_local_gc : alloc_local);
+
+for r2r transforms.
+
+Plan Creation for Complex Data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following functions create ghost cell plans that operate on complex
+valued arrays, i.e.,
+
+c2c inputs,
+
+c2c outputs,
+
+r2c outputs (use flag ``PFFT_GC_C2R``), and
+
+c2r inputs (use flag ``PFFT_GC_R2C``).
+
+Corresponding to the three interface layers for FFT planning, there are
+the following three layers for creating a complex valued ghost cell
+plan:
+
+::
+
+    pfft_gcplan pfft_plan_cgc_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        pfft_complex *data, MPI_Comm comm_cart, unsigned gc_flags);
+    pfft_gcplan pfft_plan_cgc(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        pfft_complex *data, MPI_Comm comm_cart, unsigned gc_flags);
+    pfft_gcplan pfft_plan_many_cgc(
+        int rnk_n, const ptrdiff_t *n,
+        ptrdiff_t howmany, const ptrdiff_t *block,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        pfft_complex *data, MPI_Comm comm_cart, unsigned gc_flags);
+
+Hereby, ``rnk_n``, ``n``, ``howmany`` and ``comm_cart`` must be the
+variables that were used for the PFFT plan creation. However, only the
+case ``rnk_n==3`` is completely implemented at the moment. Remember that
+``n`` is the logical FFT size just as it is the case for FFT planning.
+The block size ``block`` must be equal to ``iblock`` or ``oblock``
+depending on whether the ghost cell plan works on the FFT input or
+output array. Analogously, ``data`` becomes ``in`` or ``out``. Set the
+number of ghost cells by ``gc_below`` and ``gc_above`` as described in
+Section [sec:gc]. The flags ``gc_flags`` must be set appropriately to
+the flags that were passed to the FFT planner. Table [tab:map-cgcflags]
+shows the ghost cell planner flags that must be set in dependence on the
+listed FFT planner flags.
+
+
++----------------------------+-------------------------------+
+| FFT flag                   | ghost cell flag               |
++============================+===============================+
+| ``PFFT_TRANSPOSED_NONE``   | ``PFFT_GC_TRANSPOSED_NONE``   |
++----------------------------+-------------------------------+
+| ``PFFT_TRANSPOSED_IN``     | ``PFFT_GC_TRANSPOSED``        |
++----------------------------+-------------------------------+
+| ``PFFT_TRANSPOSED_OUT``    | ``PFFT_GC_TRANSPOSED``        |
++----------------------------+-------------------------------+
+
+[tab:map-cgcflags]
+
+In addition, we introduce the flag ``PFFT_GC_R2C`` (and its equivalent
+``PFFT_GC_C2R``) to handle the complex array storage format of r2c and
+c2r transforms. In fact, these two flags imply an ordinary complex
+valued ghost cell communication on an array of size
+``n[0] x ... x n[rnk_n-2] x (n[rnk_n-1]/2+1)``. Please note that we
+wrongly assume periodic boundary conditions in this case. Therefore, you
+should ignore the data elements with the last index behind
+``n[rnk_n-1]/2``.
+
+Plan Creation for Real Data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following functions create ghost cell plans that operate on real
+valued arrays, i.e.,
+
+r2r inputs,
+
+r2r outputs,
+
+r2c inputs, and
+
+c2r outputs.
+
+Corresponding to the three interface layers for FFT planning, there are
+the following three layers for creating a real valued ghost cell plan:
+
+::
+
+    pfft_gcplan pfft_plan_rgc_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        double *data, MPI_Comm comm_cart, unsigned gc_flags);
+    pfft_gcplan pfft_plan_rgc(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        double *data, MPI_Comm comm_cart, unsigned gc_flags);
+    pfft_gcplan pfft_plan_many_rgc(
+        int rnk_n, const ptrdiff_t *n,
+        ptrdiff_t howmany, const ptrdiff_t *block,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        double *data, MPI_Comm comm_cart, unsigned gc_flags);
+
+Hereby, ``rnk_n``, ``n``, ``howmany`` and ``comm_cart`` must be the
+variables that were used for the PFFT plan creation. Remember that ``n``
+is the logical FFT size just as it is the case for FFT planning. The
+block size ``block`` must be equal to ``iblock`` or ``oblock`` depending
+on whether the ghost cell plan works on the FFT input or output array.
+Analogously, ``data`` becomes ``in`` or ``out``. Set the number of ghost
+cells by ``gc_below`` and ``gc_above`` as described in
+Section [sec:gc:local-size]. The flags ``gc_flags`` must be set
+appropriately to the flags that were passed to the FFT planner.
+Table [tab:map-rgcflags] shows the ghost cell planner flags that must be
+set in dependence on the listed FFT planner flags.
+
+[h]
+
++----------------------------+-------------------------------+
+| FFT flag                   | ghost cell flag               |
++============================+===============================+
+| ``PFFT_TRANSPOSED_NONE``   | ``PFFT_GC_TRANSPOSED_NONE``   |
++----------------------------+-------------------------------+
+| ``PFFT_TRANSPOSED_IN``     | ``PFFT_GC_TRANSPOSED``        |
++----------------------------+-------------------------------+
+| ``PFFT_TRANSPOSED_OUT``    | ``PFFT_GC_TRANSPOSED``        |
++----------------------------+-------------------------------+
+| ``PFFT_PADDED_R2C``        | ``PFFT_GC_PADDED_R2C``        |
++----------------------------+-------------------------------+
+| ``PFFT_PADDED_C2R``        | ``PFFT_GC_PADDED_C2R``        |
++----------------------------+-------------------------------+
+
+[tab:map-rgcflags]
+
+Note that the flag ``PFFT_GC_PADDED_R2C`` (or its equivalent
+``PFFT_GC_PADDED_C2R``) implies an ordinary real valued ghost cell
+communication on an array of size
+``n[0] x ... x n[rnk_n-2] x 2*(n[rnk_n-1]/2+1)``. Especially, the
+padding elements will be handled as normal data points, i.e., you must
+be aware that the numbers of ghost cells ``gc_below[rnk_n-1]`` and
+``gc_above[rnk_n-1]`` include the number of padding elements.
+
+Unofficial Flags
+~~~~~~~~~~~~~~~~
+
+Ghost Cell Execution Timer
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+PFFT ghost cell plans automatically accumulate the local run times of
+every call to ``pfft_exchange`` and ``pfft_reduce``. For most
+applications it is sufficient to print run time of a plan ``ths``
+averaged over all runs with
+
+::
+
+    void pfft_print_average_gctimer(
+        const pfft_gcplan ths, MPI_Comm comm);
+
+Note, that for each timer the maximum time over all processes is reduced
+to rank ``0`` of communicator ``comm``, i.e., a call to ``MPI_Reduce``
+is performed and the output is only printed on this process. The
+following function works in the same way but prints more verbose output
+
+::
+
+    void pfft_print_average_gctimer_adv(
+        const pfft_gcplan ths, MPI_Comm comm);
+
+To write the averaged run time of a ghost cell plan ``ths`` into a file
+called ``name`` use
+
+::
+
+    void pfft_write_average_gctimer(
+        const pfft_gcplan ths, const char *name, MPI_Comm comm);
+    void pfft_write_average_gctimer_adv(
+        const pfft_gcplan ths, const char *name, MPI_Comm comm);
+
+Again, the output is only written on rank ``0`` of communicator
+``comm``.
+
+Discard all the recorded run times with
+
+::
+
+    void pfft_reset_gctimers(
+        pfft_gcplan ths);
+
+This function is called per default at the end of every ghost cell plan
+creation function.
+
+In order to access the run times directly a new typedef ``pfft_gctimer``
+is introduced. The following functions return a copy of the timer
+corresponding to ghost cell plan ``ths`` that accumulated the time for
+ghost cell exchange or ghost cell reduce, respectively:
+
+::
+
+    pfft_gctimer pfft_get_gctimer_exg(
+        const pfft_gcplan ths);
+    pfft_gctimer pfft_get_gctimer_red(
+        const pfft_gcplan ths);
+
+Note that the memory of the returned ``pfft_gctimer`` must be released
+with
+
+::
+
+    void pfft_destroy_gctimer(
+        pfft_gctimer ths);
+
+as soon as the timer is not needed anymore.
+
+In the following we introduce some routines to perform basic operations
+on timers. For all functions with a ``pfft_gctimer`` return value you
+must use ``pfft_destroy_gctimer`` in order to release the allocated
+memory of the timer. Create a copy of a ghost cell timer ``orig`` with
+
+::
+
+    pfft_gctimer pfft_copy_gctimer(
+        const pfft_gctimer orig);
+
+Compute the average, local time over all runs of a timer ``ths`` with
+
+::
+
+    void pfft_average_gctimer(
+        pfft_gctimer ths);
+
+Create a new timer that contains the sum of two timers ``sum1`` and
+``sum2`` with
+
+::
+
+    pfft_gctimer pfft_add_gctimers(
+        const pfft_gctimer sum1, const pfft_gctimer sum2);
+
+Create a timer that contains the maximum times of all the timers ``ths``
+from all processes belonging to communicator ``comm`` with
+
+::
+
+    pfft_gctimer pfft_reduce_max_gctimer(
+        const pfft_gctimer ths, MPI_Comm comm);
+
+Since this function calls ``MPI_Reduce``, only the first process (rank
+0) of ``comm`` will get the desired data while all the other processes
+have timers with undefined values.
+
+Note, that you can not access the elements of a timer directly, since it
+is only a pointer to a ``struct``. However, PFFT offers a routine that
+creates an array and copies all the entries of the timer into it
+
+::
+
+    void pfft_convert_gctimer2vec(
+        const pfft_gctimer ths, double *times);
+
+Remember to use ``free`` in order to release the allocated memory of the
+returned array at the moment it is not needed anymore. The entries of
+the returned array are ordered as follows:
+
+number of ``pfft_execute`` runs ``iter``
+
+local run time of all runs
+
+local run time of zero padding (make room for incoming ghost cells and
+init with zeros)
+
+local run time of the ghost cell exchange or reduce (depending on the
+timer)
+
+The complementary function
+
+::
+
+    pfft_gctimer pfft_convert_vec2gctimer(
+        const double *times);
+
+creates a timer and fills its entries with the data from array
+``times``. Thereby, the entries of ``times`` must be in the same order
+as above.
+
+Useful Tools
+------------
+
+The following functions are useful tools but are not necessarily needed
+to perform parallel FFTs.
+
+Initializing Complex Inputs and Checking Outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To fill a complex array ``data`` with reproducible, complex values you
+can use one of the functions
+
+::
+
+    void pfft_init_input_complex_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        pfft_complex *data);
+    void pfft_init_input_complex(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        pfft_complex *data);
+
+Hereby, the arrays ``n``, ``local_n`` and ``local_n_start`` of length
+``rnk_n`` (``rnk_n==3`` for ``_3d``) give the size of the FFT, the local
+array size and the local array offset as computed by the array
+distribution functions described in Section [sec:local-size]. The
+functions
+
+::
+
+    double pfft_check_output_complex_3d(
+        const ptrdiff_t *n, 
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        const pfft_complex *data, MPI_Comm comm);
+    double pfft_check_output_complex(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const pfft_complex *data, MPI_Comm comm);
+
+compute the :math:`l_1`-norm between the elements of array ``data`` and
+values produced by ``pfft_init_input_complex_3d``,
+``pfft_init_input_complex``. In addition, we supply the following
+functions for setting all the input data to zero at once
+
+::
+
+    void pfft_clear_input_complex_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        pfft_complex *data);
+    void pfft_clear_input_complex(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        pfft_complex *data);
+
+Note, that these functions can be combined for a quick consistency check
+of the FFT. Since a forward FFT followed by a backward FFT reproduces
+the inputs up to a scaling factor, the following code snippet should
+give a result equal to zero up to machine precision.
+
+::
+
+    /* Initialize input with random numbers */
+    pfft_init_input_complex_3d(n, local_ni, local_i_start,
+        in);
+
+    /* execute parallel forward FFT */
+    pfft_execute(plan_forw);
+
+    /* clear the old input */
+    if(in != out) 
+      pfft_clear_input_complex_3d(n, local_ni, local_i_start, in);
+
+    /* execute parallel backward FFT */
+    pfft_execute(plan_back);
+
+    /* Scale data */
+    for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1] * local_ni[2]; l++)
+      in[l] /= (n[0]*n[1]*n[2]);
+
+    /* Print error of back transformed data */
+    err = pfft_check_output_complex_3d(n, local_ni, local_i_start, in, comm_cart_2d);
+    pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td, %td):\n", n[0], n[1], n[2]);
+    pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err);
+
+Hereby, we set all inputs equal to zero after the forward FFT in order
+to be sure that all the final results are actually computed by the
+backward FFT instead of being a buggy relic of the forward transform.
+
+Initializing Real Inputs and Checking Outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To fill a real array ``data`` with reproducible, real values use one of
+the functions
+
+::
+
+    void pfft_init_input_real_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        double *data);
+    void pfft_init_input_real(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        double *data);
+
+Hereby, the arrays ``n``, ``local_n`` and ``local_n_start`` give the
+size of the FFT, the local array size and the local array offset as
+computed by the array distribution functions described in
+Section [sec:local-size]. The functions
+
+::
+
+    double pfft_check_output_real_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        const double *data, MPI_Comm comm);
+    double pfft_check_output_real(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const double *data, MPI_Comm comm);
+
+compute the :math:`l_1`-norm between the elements of array ``data`` and
+values produced by ``pfft_init_input_real_3d``,
+``pfft_init_input_real``. In addition, we supply the following functions
+for setting all the input data to zero at once
+
+::
+
+    void pfft_clear_input_real_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        double *data);
+    void pfft_clear_input_real(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        double *data);
+
+Note, that both ``pfft_init_input_real*`` functions will set all array
+elements to zero where ``local_n + local_n_start >= n``. In
+addition, both ``pfft_check_output_real*`` functions will ignore all the
+errors resulting from these elements. Therefore, it is safe to use all
+these functions for a consistency check of a r2c transform followed by a
+c2r transform since all padding elements will be ignored.
+
+Initializing r2c/c2r Inputs and Checking Outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The real inputs of a r2c transform can be initialized with the functions
+described in Section [sec:init-data-3d-r2r]. However, generating suitable
+inputs for a c2r transform requires more caution. In order to get real
+valued results of a DFT the complex input coefficients need to satisfy
+a radial Hermitian symmetry, i.e.,
+:math:`X[{{\boldsymbol{k}}}] = {X^*[-{{\boldsymbol{k}}}]}`. We use the
+following trick to generate the complex input values for c2r transforms.
+Assume any :math:`{{\boldsymbol{N}}}`-periodic complex valued function
+:math:`f`. It can be easily shown that the values
+:math:`X[{{\boldsymbol{k}}}] := \frac{1}{2}\left(f({{\boldsymbol{k}}})+f^*(-{{\boldsymbol{k}}})\right)`
+satisfy the radial Hermitian symmetry.
+
+To fill a complex array ``data`` with reproducible, complex values that
+fulfill the radial Hermitian symmetry use one of the functions
+
+::
+
+    void pfft_init_input_complex_hermitian_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        pfft_complex *data);
+    void pfft_init_input_complex_hermitian(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        pfft_complex *data);
+
+Hereby, the arrays ``n``, ``local_n`` and ``local_n_start`` give the
+size of the FFT, the local array size and the local array offset as
+computed by the array distribution functions described in
+Section [sec:local-size]. The functions
+
+::
+
+    double pfft_check_output_complex_hermitian_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        const pfft_complex *data, MPI_Comm comm);
+    double pfft_check_output_complex_hermitian(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const pfft_complex *data, MPI_Comm comm);
+
+compute the :math:`l_1`-norm between the elements of array ``data`` and
+values produced by ``pfft_init_input_complex_hermitian_3d``,
+``pfft_init_input_complex_hermitian``. In addition, we supply the
+following functions for setting all the input data to zero at once
+
+::
+
+    void pfft_clear_input_complex_hermitian_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        pfft_complex *data);
+    void pfft_clear_input_complex_hermitian(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        pfft_complex *data);
+
+Note, that these functions can also be used in order to generate complex
+inputs with radial Hermitian symmetry for ordinary c2c transforms. Of
+course the results of such a c2c DFT will have all imaginary parts equal
+to zero up to machine precision.
+
+Operations on Arrays of Type ``ptrdiff_t``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following routines are shortcuts for the elementwise manipulation of
+``ptrdiff_t`` valued arrays. In the following, all arrays ``vec``,
+``vec1``, and ``vec2`` are of length ``d`` and type ``ptrdiff_t``.
+
+::
+
+    ptrdiff_t pfft_prod_INT(
+        int d, const ptrdiff_t *vec);
+
+Returns the product over all elements of ``vec``.
+
+::
+
+    ptrdiff_t pfft_sum_INT(
+        int d, const ptrdiff_t *vec);
+
+Returns the sum over all elements of ``vec``.
+
+::
+
+    int pfft_equal_INT(
+        int d, const ptrdiff_t *vec1, const ptrdiff_t *vec2);
+
+Returns 1 if both arrays have equal entries, 0 otherwise.
+
+::
+
+    void pfft_vcopy_INT(
+        int d, const ptrdiff_t *vec1,
+        ptrdiff_t *vec2);
+
+Copies the elements of ``vec1`` into ``vec2``.
+
+::
+
+    void pfft_vadd_INT(
+        int d, const ptrdiff_t *vec1, const ptrdiff_t *vec2,
+        ptrdiff_t *sum);
+
+Fills ``sum`` with the componentwise sum of ``vec1`` and ``vec2``.
+
+::
+
+    void pfft_vsub_INT(
+        int d, const ptrdiff_t *vec1, const ptrdiff_t *vec2,
+        ptrdiff_t *sum);
+
+Fills ``sum`` with the componentwise difference of ``vec1`` and
+``vec2``.
+
+Print Three-Dimensional Arrays in Parallel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use the following routine to print the elements of a block decomposed
+three-dimensional (real or complex valued) array ``data`` in a nicely
+formatted way.
+
+::
+
+    void pfft_apr_real_3d(
+        const double *data,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const char *name, MPI_Comm comm);
+    void pfft_apr_complex_3d(
+        const pfft_complex *data,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const char *name, MPI_Comm comm);
+
+Obviously, this only makes sense for arrays of moderate size. The block
+decomposition is given by ``local_n``, ``local_n_start`` as returned by
+the array distribution function described in Section [sec:local-size].
+Furthermore, some arbitrary string ``name`` can be added at the
+beginning of each output - typically this will be the name of the array.
+Communicator ``comm`` must be suitable to the block decomposition and is
+used to synchronize the outputs over all processes.
+
+Generalizations for the case where the dimensions of the local arrays
+are permuted are given by
+
+::
+
+    void pfft_apr_real_permuted_3d(
+        const double *data,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        int perm0, int perm1, int perm2,
+        const char *name, MPI_Comm comm);
+    void pfft_apr_complex_permuted_3d(
+        const pfft_complex *data,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        int perm0, int perm1, int perm2,
+        const char *name, MPI_Comm comm);
+
+Hereby, ``perm0``, ``perm1``, and ``perm2`` give the array’s permutation
+of dimension.
+
+Reading Command Line Arguments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following function offers a simple way to read command line
+arguments into an array ``parameter``.
+
+::
+
+    void pfft_get_args(
+        int argc, char **argv, const char *name,
+        int neededArgs, unsigned type,
+        void *parameter);
+
+Hereby, ``argc`` and ``argv`` are the standard arguments of the ``main``
+routine. Furthermore, ``name``, ``neededArgs``, and ``type`` give the
+name, number of entries and the type of the command line argument.
+Supported types are ``PFFT_INT``, ``PFFT_PTRDIFF_T``, ``PFFT_FLOAT``,
+``PFFT_DOUBLE``, and ``PFFT_UNSIGNED``, which denote the standard C type
+that is used for typecasting. In addition, you can use the special type
+``PFFT_SWITCH`` that is an integer type equal to one if the
+corresponding command line argument is given. The array ``parameter``
+must be of sufficient size to hold ``neededArgs`` elements of the given
+data type. Special attention is given
+
+For example, a program containing the following code snippet
+
+::
+
+    double x=0.1;
+    pfft_get_args(argc, argv, "-pfft_x", 1, PFFT_DOUBLE, &x);
+    int np[2]={2,1};
+    pfft_get_args(argc, argv, "-pfft_np", 2, PFFT_INT, np);
+    ptrdiff_t n[3]={32,32,32};
+    pfft_get_args(argc, argv, "-pfft_n", 3, PFFT_PTRDIFF_T, n);
+    int switch=0;
+    pfft_get_args(argc, argv, "-pfft_on", 0, PFFT_SWITCH, switch);
+
+that is executed via
+
+::
+
+    ./test -pfft_x 3.1 -pfft_np 2 3 -pfft_n 8 16 32 -pfft_on
+
+will read ``x=3.1``, ``np[2] = (2,3)``, ``n[3]= (8,16,32)``, and turn on
+the ``switch=1``. Note the address operator ``&`` in front of ``x`` in
+the second line! Furthermore, note that the initialization of all
+variables with default values before the call of ``pfft_get_args``
+avoids trouble if the user does not provide all the command line
+arguments.
+
+Parallel Substitutes for ``vprintf``, ``fprintf``, and ``printf``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following functions are similar to the standard C functions
+``vfprintf``, ``fprintf`` and ``printf`` with the exception, that only
+rank ``0`` within the given communicator ``comm`` will produce output.
+The intention is to avoid the flood of messages that is produced when
+simple ``printf`` statements are run in parallel.
+
+::
+
+    void pfft_vfprintf(
+        MPI_Comm comm, FILE *stream, const char *format, va_list ap);
+    void pfft_fprintf(
+        MPI_Comm comm, FILE *stream, const char *format, ...);
+    void pfft_printf(
+        MPI_Comm comm, const char *format, ...);
+
+Generating Periodic Cartesian Communicators
+-------------------------------------------
+
+Based on the processes that are part of the given communicator ``comm``
+the following routine
+
+::
+
+    int pfft_create_procmesh_1d(
+        MPI_Comm comm, int np0,
+        MPI_Comm *comm_cart_1d);
+
+allocates and creates a one-dimensional, periodic, Cartesian
+communicator ``comm_cart_1d`` of size ``np0``. Thereby, a non-zero error
+code is returned whenever ``np0`` does not fit the size of ``comm``. The
+memory of the generated communicator should be released with
+``MPI_Comm_free`` after usage. Analogously, use
+
+::
+
+    int pfft_create_procmesh_2d(
+        MPI_Comm comm, int np0, int np1,
+        MPI_Comm *comm_cart_2d);
+
+in order to allocate and create a two-dimensional, periodic, Cartesian
+communicator ``comm_cart_2d`` of size ``np0*np1`` or
+
+::
+
+    int pfft_create_procmesh(
+        int rnk_np, MPI_Comm comm, const int *np,
+        MPI_Comm *comm_cart);
+
+in order to allocate and create a ``rnk_np``-dimensional, periodic,
+Cartesian communicator of size ``np[0]*np[1]*...*np[rnk_np-1]``. Hereby,
+``np`` is an array of length ``rnk_np``. Again, the memory of the
+generated communicator should be released with ``MPI_Comm_free`` after
+usage.
diff --git a/doc/reference.tex b/doc/reference.tex
index 77d8a74..315148b 100644
--- a/doc/reference.tex
+++ b/doc/reference.tex
@@ -1177,7 +1177,7 @@ \subsection{Reading Command Line Arguments}
 \begin{lstlisting}
 ./test -pfft_x 3.1 -pfft_np 2 3 -pfft_n 8 16 32 -pfft_on
 \end{lstlisting}
-will read \code{x=3.1}, \code{np[2] = \{2,3\}}, \code{n[3]=\{8,16,32\}}, and turn on the \code{switch=1}.
+will read \code{x=3.1}, \code{np[2] = (2,3)}, \code{n[3]= (8,16,32)}, and turn on the \code{switch=1}.
 Note the address operator \code{&} in front of \code{x} in the second line!
 Furthermore, note that the initialization of all variables with default values before the call of \code{pfft_get_args}
 avoids trouble if the user does not provide all the command line arguments.
diff --git a/doc/shortcuts.tex b/doc/shortcuts.tex
index 7d664e5..d02b976 100644
--- a/doc/shortcuts.tex
+++ b/doc/shortcuts.tex
@@ -6,23 +6,23 @@
 
 
 % Shortcuts for math symbols.
-\newcommand{\N}{\ensuremath{\mathbb{N}}}
-\newcommand{\T}{\ensuremath{\mathbb{T}}}
-\renewcommand{\S}{\ensuremath{\mathbb{S}}}
-\newcommand{\NZ}{\ensuremath{\mathbb{N}_{0}}}
-\newcommand{\Z}{\ensuremath{\mathbb{Z}}}
-\newcommand{\R}{\ensuremath{\mathbb{R}}}
-\newcommand{\Rp}{\ensuremath{\mathbb{R}_{+}}}
-\newcommand{\Rn}{\ensuremath{\mathbb{R}^n}} 
-\newcommand{\Rnn}{\ensuremath{\mathbb{R}^{n \times n}}}
-\newcommand{\C}{\ensuremath{\mathbb{C}}}
-\newcommand{\cO}{\ensuremath{\mathcal{O}}}
-\newcommand{\tT}{\ensuremath{\text{\tiny{T}}}}
-\newcommand{\ti}{\ensuremath{\text{\scriptsize{i}}}}
-\newcommand{\e}{{\ensuremath{\mathrm{e}}}}
-\newcommand{\eim}[1]{\ensuremath{\mathrm{e}^{-2\pi{\ti} #1}}}
-\newcommand{\eip}[1]{\ensuremath{\mathrm{e}^{+2\pi{\ti} #1}}}
-\renewcommand{\mathbf}[1]{\ensuremath{\boldsymbol{#1}}}
+\newcommand{\N}{{\mathbb{N}}}
+\newcommand{\T}{{\mathbb{T}}}
+\renewcommand{\S}{{\mathbb{S}}}
+\newcommand{\NZ}{{\mathbb{N}_{0}}}
+\newcommand{\Z}{{\mathbb{Z}}}
+\newcommand{\R}{{\mathbb{R}}}
+\newcommand{\Rp}{{\mathbb{R}_{+}}}
+\newcommand{\Rn}{{\mathbb{R}^n}} 
+\newcommand{\Rnn}{{\mathbb{R}^{n \times n}}}
+\newcommand{\C}{{\mathbb{C}}}
+\newcommand{\cO}{{\mathcal{O}}}
+\newcommand{\tT}{{\text{T}}}
+\newcommand{\ti}{{\text{i}}}
+\newcommand{\e}{{{\mathrm{e}}}}
+\newcommand{\eim}[1]{{\mathrm{e}^{-2\pi{\ti} #1}}}
+\newcommand{\eip}[1]{{\mathrm{e}^{+2\pi{\ti} #1}}}
+\renewcommand{\mathbf}[1]{{\boldsymbol{#1}}}
 \newcommand{\ds}{\displaystyle}
 \newcommand{\sinc}{\textrm{sinc}}
 \newcommand{\dist}{\textrm{dist}}
@@ -44,9 +44,9 @@
 {\raise-0.05em\hbox{\Large $#1$}}{\hbox{\large $#1$}}{#1}}}
 \newcommand{\bigtimes}{\BIGOP{\times}}
 \def\invisible#1{\textcolor{white}{#1}}
-\newcommand{\Vect}[1]{\ensuremath{\mathbf{#1}}}
-\newcommand{\Mat}[1]{\ensuremath{\mathbf{#1}}}
-\newcommand{\Cal}[1]{\ensuremath{\mathcal{#1}}}
+\newcommand{\Vect}[1]{{\mathbf{#1}}}
+\newcommand{\Mat}[1]{{\mathbf{#1}}}
+\newcommand{\Cal}[1]{{\mathcal{#1}}}
 \newcommand{\fft}{\textsf{FFT}}
 
 \newcommand{\ousetarrow}[2]{\overset{\textsf{#1}}{\underset{\textsf{#2}}{\rightarrow}}}
@@ -183,7 +183,8 @@
   %       belowskip= -2ex}
   % 
 %   \newcommand{\code}[1]{\linebreak[2]{\ttfamily #1}}
-  \newcommand{\code}[2][\empty]{\ifthenelse{\equal{#1}{\empty}}{\lstinline!#2!}{\lstinline[#1]!#2!}}
+   \newcommand{\code}[1]{:code:`#1`}
+%  \newcommand{\code}[2][\empty]{\ifthenelse{\equal{#1}{\empty}}{\lstinline!#2!}{\lstinline[#1]!#2!}}
 }
 
 %% own enviroment for case differentiation
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
new file mode 100644
index 0000000..cb65b46
--- /dev/null
+++ b/doc/tutorial.rst
@@ -0,0 +1,573 @@
+.. Stray LaTeX macro residue left by the automatic conversion was removed here.
+
+
+
+Tutorial
+========
+
+The following chapter describes the usage of the PFFT library at the
+example of a simple test file in the first section, followed by the more
+advanced features of PFFT in the next sections.
+
+A first parallel transform - Three-dimensional FFT with two-dimensional data decomposition
+------------------------------------------------------------------------------------------
+
+We explain the basic steps for computing a parallel FFT with the PFFT
+library at the example of the short test program given by
+Listing [lst:man_c2c]. This test computes a three-dimensional
+c2c-FFT on a two-dimensional process mesh. The source code
+``manual_c2c_3d.c`` can be found in directory ``tests/`` of the
+library’s source code tree.
+
+After initializing MPI with ``MPI_Init`` and before calling any other
+PFFT routine initialize the parallel FFT computations via
+
+::
+
+    void pfft_init(void);
+
+MPI introduces the concept of communicators to store all the topological
+information of the physical process layout. PFFT requires to be called
+on a process mesh that corresponds to a periodic, Cartesian
+communicator. We assist the user in creating such a communicator with
+the following routine
+
+::
+
+    int pfft_create_procmesh_2d(
+        MPI_Comm comm, int np0, int np1,
+        MPI_Comm *comm_cart_2d);
+
+This routine uses the processes within the communicator ``comm`` to
+create a two-dimensional process grid of size ``np0`` x ``np1`` and
+stores it into the Cartesian communicator ``comm_cart_2d``. Note that
+``comm_cart_2d`` is allocated by the routine and must be freed with
+``MPI_Comm_free`` after usage. The input parameter ``comm`` is a
+communicator, indicating which processes will participate in the
+transform. Choosing ``comm`` as ``MPI_COMM_WORLD`` implies that the FFT
+is computed on all available processes.
+
+At the next step we need to know the data decomposition of the input and
+output array, that depends on the array sizes, the process grid and the
+chosen parallel algorithm. Therefore, we call
+
+::
+
+    ptrdiff_t pfft_local_size_dft_3d(
+        ptrdiff_t *n, MPI_Comm comm_cart_2d, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Hereby, ``n``, ``local_ni``, ``local_i_start``, ``local_no``,
+``local_o_start`` are arrays of length :math:`3` that must be allocated.
+The return value of this function equals the size of the local complex
+array that needs to be allocated by every process. In most cases, this
+coincides with the product of the local array sizes – but may be bigger,
+whenever the parallel algorithm needs some extra storage. The input
+value ``n`` gives the three-dimensional FFT size and the flag
+``pfft_flags`` serves to adjust some details of the parallel execution.
+For the sake of simplicity, we restrict ourselves to the case
+``pfft_flags=PFFT_TRANSPOSED_NONE`` for a while and explain the more
+sophisticated flags at a later point. The output arrays ``local_ni`` and
+``local_i_start`` give the size and the offset of the local input array
+that result from the parallel block distribution of the global input
+array, i.e., every process owns the input data ``in[k[0],k[1],k[2]]``
+with ``local_i_start[t] <= k[t] < local_i_start[t] + local_ni[t]`` for
+``t=0,1,2``. Analogously, the output parameters ``local_no`` and
+``local_o_start`` contain the size and the offset of the local output
+array.
+
+Afterward, the input and output arrays must be allocated. Hereby,
+
+::
+
+    pfft_complex* pfft_alloc_complex(size_t size);
+
+is a simple wrapper of ``fftw_alloc_complex``, which in turn allocates
+the memory via ``fftw_malloc`` to ensure proper alignment for SIMD. Have
+a look at the FFTW user manual for more details on SIMD memory
+alignment and ``fftw_malloc``. Nevertheless, you can also use any other
+dynamic memory allocation.
+
+The planning of a single three-dimensional parallel FFT of size ``n[0]``
+x ``n[1]`` x ``n[2]`` is done by the function
+
+::
+
+    pfft_plan pfft_plan_dft_3d(
+        ptrdiff_t *n, pfft_complex *in, pfft_complex *out,
+        MPI_Comm comm_cart_2d, int sign, unsigned pfft_flags);
+
+We provide the address of the input and output array by the pointers
+``in`` and ``out``, respectively. An inplace transform is assumed if
+these pointers are equal. The integer ``sign`` gives the sign in the
+exponential of the FFT. Possible values are ``PFFT_FORWARD``
+(:math:`-1`) and ``PFFT_BACKWARD`` (:math:`+1`). Flags passed to the
+planner via ``pfft_flags`` must coincide with the flags that were
+passed to ``pfft_local_size_dft_3d``. Otherwise the data layout of the
+parallel execution may not match calculated local array sizes. As return
+value we get a PFFT plan, some structure that stores all the information
+needed to perform a parallel FFT.
+
+Once the plan is generated, we are allowed to fill the input array
+``in``. Note that by default the planning step ``pfft_plan_dft_3d``
+will overwrite the input array ``in``. Therefore, you should not write any
+sensitive data into ``in`` until the plan has been generated. For simplicity,
+our test program makes use of the library function
+
+::
+
+    void pfft_init_input_complex_3d(
+        ptrdiff_t *n, ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        pfft_complex *in);
+
+to fill the input array with some numbers. Alternatively, one can fill
+the array with a function ``func`` of choice and the following loop that
+takes account of the parallel data layout
+
+::
+
+    ptrdiff_t m=0;
+    for(ptrdiff_t k0=0; k0 < local_ni[0]; k0++)
+      for(ptrdiff_t k1=0; k1 < local_ni[1]; k1++)
+        for(ptrdiff_t k2=0; k2 < local_ni[2]; k2++)
+          in[m++] = func(k0 + local_i_start[0],
+                         k1 + local_i_start[1],
+                         k2 + local_i_start[2]);
+
+The parallel FFT is computed when we execute the generated plan via
+
+::
+
+    void pfft_execute(const pfft_plan plan);
+
+Now, the results can be read from ``out`` with an analogous
+three-dimensional loop. If we do not want to execute another parallel
+FFT of the same type, we free the allocated memory of the plan with
+
+::
+
+    void pfft_destroy_plan(pfft_plan plan);
+
+Additionally, we use
+
+::
+
+    int MPI_Comm_free(MPI_Comm *comm);  
+
+to free the communicator allocated by ``pfft_create_procmesh_2d`` and
+
+::
+
+    void pfft_free(void *ptr);
+
+to free memory allocated by ``pfft_alloc_complex``. Finally, we exit MPI
+via
+
+::
+
+    int MPI_Finalize(void);
+
+Porting FFTW-MPI based code to PFFT
+-----------------------------------
+
+We illustrate the close connection between FFTW-MPI and PFFT at a
+three-dimensional MPI example analogous to the example given in the FFTW
+manual.
+
+Exactly the same task can be performed with PFFT as given in
+Listing [lst:pfft_3don1d].
+
+::
+
+    #include <pfft.h>
+         
+    int main(int argc, char **argv)
+    {
+        const ptrdiff_t n[3] = {..., ..., ...};
+        pfft_plan plan;
+        pfft_complex *data;
+        ptrdiff_t alloc_local, local_ni[3], local_i_start[3], local_no[3], local_o_start[3], i, j, k;
+        unsigned pfft_flags = 0;
+
+        MPI_Init(&argc, &argv);
+        pfft_init();
+
+        /* get local data size and allocate */
+        alloc_local = pfft_local_size_dft_3d(n, MPI_COMM_WORLD, pfft_flags,
+                             local_ni, local_i_start,
+                             local_no, local_o_start);
+        data = pfft_alloc_complex(alloc_local);
+
+        /* create plan for in-place forward DFT */
+        plan = pfft_plan_dft_3d(n, data, data, MPI_COMM_WORLD,
+                    PFFT_FORWARD, PFFT_ESTIMATE);
+
+        /* initialize data to some function my_function(x,y,z) */
+        for (i = 0; i < local_ni[0]; ++i)
+          for (j = 0; j < n[1]; ++j) 
+            for (k = 0; k < n[2]; ++k)
+              data[i*n[1]*n[2] + j*n[2] + k] = my_function(local_i_start[0] + i, j, k);
+
+        /* compute transforms, in-place, as many times as desired */
+        pfft_execute(plan);
+
+        pfft_destroy_plan(plan);
+
+        MPI_Finalize();
+    }
+
+substitute ``fftw3-mpi.h`` by ``pfft.h``
+
+substitute all prefixes ``fftw_`` and ``fftw_mpi_`` by ``pfft_``
+
+substitute all prefixes ``FFTW_`` by ``PFFT_``
+
+the integers ``N``, ``local_n0``, ``local_0_start`` become arrays of
+length 3
+
+``dft_`` in ``pfft_local_size_dft_3d``
+
+``pfft_local_size_dft_3d`` has additional input ``pfft_flags`` and
+additional outputs ``local_no``, ``local_o_start``
+
+The loop that initializes ``data`` is split along all three
+dimensions. We could also use
+
+First, all prefixes ``fftw_`` are substituted by ``pfft_``
+
+Now, the changes in order to use a two-dimensional process mesh are
+marginal as can be seen in Listing [lst:pfft_3don2d].
+
+::
+
+    #include <pfft.h>
+         
+    int main(int argc, char **argv)
+    {
+        const ptrdiff_t n[3] = {..., ..., ...};
+        const int np0 = ..., np1 = ...;
+        pfft_plan plan;
+        pfft_complex *data;
+        ptrdiff_t alloc_local, local_ni[3], local_i_start[3], local_no[3], local_o_start[3], i, j, k;
+        unsigned pfft_flags = 0;
+        MPI_Comm comm_cart_2d;
+
+        MPI_Init(&argc, &argv);
+        pfft_init();
+
+        /* create two-dimensional process grid of size np0 x np1 */
+        pfft_create_procmesh_2d(MPI_COMM_WORLD, np0, np1,
+            &comm_cart_2d);
+
+        /* get local data size and allocate */
+        alloc_local = pfft_local_size_dft_3d(n, comm_cart_2d, pfft_flags,
+                             local_ni, local_i_start,
+                             local_no, local_o_start);
+        data = pfft_alloc_complex(alloc_local);
+
+        /* create plan for in-place forward DFT */
+        plan = pfft_plan_dft_3d(n, data, data, comm_cart_2d,
+                    PFFT_FORWARD, PFFT_ESTIMATE);
+
+        /* initialize data to some function my_function(x,y,z) */
+        for (i = 0; i < local_ni[0]; ++i)
+          for (j = 0; j < local_ni[1]; ++j)
+            for (k = 0; k < local_ni[2]; ++k)
+              data[i*local_ni[1]*local_ni[2] + j*local_ni[2] + k] =
+                  my_function(local_i_start[0] + i,
+                      local_i_start[1] + j,
+                      local_i_start[2] + k);
+
+        /* compute transforms, in-place, as many times as desired */
+        pfft_execute(plan);
+
+        pfft_destroy_plan(plan);
+
+        MPI_Finalize();
+    }
+
+Error code for communicator creation
+------------------------------------
+
+As we have seen the function
+
+::
+
+    int pfft_create_procmesh_2d(
+        MPI_Comm comm, int np0, int np1,
+        MPI_Comm *comm_cart_2d);
+
+creates a two-dimensional, periodic, Cartesian communicator. The ``int``
+return value (not used in Listing [lst:man_c2c]) is the
+forwarded error code of ``MPI_Cart_create``. It is equal to zero if the
+communicator was created successfully. The most common error is that the
+number of processes within the input communicator ``comm`` does not fit
+``np0 x np1``. In this case the Cartesian communicator is not generated
+and the return value is unequal to zero. Therefore, a typical sanity
+check might look like
+
+::
+
+    /* Create two-dimensional process grid of size np[0] x np[1],
+       if possible */
+    if( pfft_create_procmesh_2d(MPI_COMM_WORLD, np[0], np[1],
+            &comm_cart_2d) )
+    {
+      pfft_fprintf(MPI_COMM_WORLD, stderr,
+          "Error: This test file only works with %d processes.\n",
+          np[0]*np[1]);
+      MPI_Finalize();
+      return 1;
+    }
+
+Hereby, we use the PFFT library function
+
+::
+
+    void pfft_fprintf(
+        MPI_Comm comm, FILE *stream, const char *format, ...);
+
+to print the error message. This function is similar to the standard C
+function ``fprintf`` with the exception that only the process with MPI
+rank :math:`0` within the given communicator ``comm`` will produce some
+output; see Section [sec:fprintf] for details.
+
+Inplace transforms
+------------------
+
+Similar to FFTW, PFFT is able to compute parallel FFTs completely in
+place, which means that besides some constant buffers, no second data
+array is necessary. In particular, the global data communication can be
+performed in place. As far as we know, there is no other parallel FFT
+library besides FFTW and PFFT that supports this feature. This feature is
+enabled as soon as the pointer to the output array ``out`` is equal to
+the pointer to the input array ``in``. E.g., in
+Listing [lst:man_c2c] we would call
+
+::
+
+    /* Plan parallel forward FFT */
+    plan = pfft_plan_dft_3d(n, in, in, comm_cart_2d,
+        PFFT_FORWARD, PFFT_TRANSPOSED_NONE);
+
+Higher dimensional data decomposition
+-------------------------------------
+
+The test program given in Listing [lst:man_c2c] used a
+two-dimensional data decomposition of a three-dimensional data set.
+Moreover, PFFT supports the computation of any :math:`d`-dimensional FFT
+with :math:`r`-dimensional data decomposition as long as
+:math:`r\le d-1`. For example, one can use a one-dimensional data
+decomposition for any two- or higher-dimensional data set, while the
+data set must be at least four-dimensional to fit to a three-dimensional
+data decomposition. The case :math:`r=d` is not supported efficiently,
+since during the parallel computations there is always at least one
+dimension that remains local, i.e., one dimension stays non-decomposed.
+The only exception from this rule is the case :math:`d=r=3` that is
+supported by PFFT in a special way, see Section [sec:3don3d] for
+details.
+
+The dimensionality of the data decomposition is given by the dimension
+of the Cartesian communicator that goes into the PFFT planning routines.
+Therefore, we present a generalization of the communicator creation function
+
+::
+
+    int pfft_create_procmesh(
+        int rnk_np, MPI_Comm comm, const int *np,
+        MPI_Comm *comm_cart);
+
+Hereby, the array ``np`` of length ``rnk_np`` gives the size of the
+Cartesian communicator ``comm_cart``.
+
+Parallel data decomposition
+---------------------------
+
+In the following, we use the notation :math:`\frac{n}{P}` to symbolize
+that an array of length :math:`n` is broken into disjoint blocks and
+distributed on :math:`P` MPI processes. Hereby, the data is distributed
+in compliance with the FFTW-MPI data decomposition, i.e., the first
+``n/block`` (rounded down) processes get a contiguous chunk of ``block``
+elements, the next process gets the remaining ``n - block * (n/block)``
+data elements, and all remaining processes get nothing. Thereby, the
+block size ``block`` defaults to ``n/P`` (rounded down) but can also be
+user defined.
+
+Non-transposed and transposed data layout
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In the following, we use the notation :math:`\frac{n}{P}` to symbolize
+that an array of length :math:`n` is distributed on :math:`P` MPI
+processes. The standard PFFT data decomposition of :math:`h` interleaved
+:math:`d`-dimensional arrays of equal size
+:math:`n_0 \times n_1\times \dots \times n_{d-1}` on a
+:math:`r`-dimensional process mesh of size
+:math:`P_0\times \dots \times P_{r-1}` is given by the blocks
+
+.. math:: \frac{n_0}{P_0} \times \frac{n_1}{P_1} \times \dots \times \frac{n_{r-1}}{P_{r-1}}  \times n_r \times n_{r+1} \times \dots \times n_{d-1} \times h.
+
+A PFFT created with planning flag ``PFFT_TRANSPOSED_NONE`` requires the
+inputs to be decomposed in this standard way and produces outputs that
+are decomposed in the same way.
+
+PFFT can save half of the global communication amount, if the data
+reordering to standard decomposition is omitted. The transposed data
+decomposition is given by
+
+.. math:: \frac{n_1}{P_0} \times \frac{n_2}{P_1} \times \dots \times \frac{n_{r}}{P_{r-1}}  \times n_0 \times n_{r+1} \times \dots \times n_{d-1} \times h
+
+A PFFT plan created with planning flag ``PFFT_TRANSPOSED_OUT`` produces
+outputs with transposed data decomposition. Analogously, a PFFT plan
+created with planning flag ``PFFT_TRANSPOSED_IN`` requires its inputs to
+be decomposed in the transposed way. Typically, one creates a forward
+plan with ``PFFT_TRANSPOSED_OUT`` and a backward plan with planning flag
+``PFFT_TRANSPOSED_IN``.
+
+Note that the flags ``PFFT_TRANSPOSED_OUT`` and ``PFFT_TRANSPOSED_IN``
+must be passed to the array distribution function (see
+Section [sec:local-size]) *as well as* to the planner (see
+Section [sec:create-plan]).
+
+Three-dimensional FFTs with three-dimensional data decomposition
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Many applications work with three-dimensional block decompositions of
+three-dimensional arrays. PFFT supports decompositions of the kind
+
+.. math:: \frac{n_0}{P_0} \times \frac{n_1}{P_1} \times \frac{n_2}{P_2} \times h.
+
+However, PFFT applies a parallel algorithm that needs at least one
+non-distributed transform dimension (we do not transform along
+:math:`h`). Therefore, we split the number of processes along the last
+dimension into two factors :math:`P_2=Q_0Q_1`, remap the data to the
+two-dimensional decomposition
+
+.. math:: \frac{n_0}{P_0Q_0} \times \frac{n_1}{P_1Q_1} \times n_2 \times h,
+
+and compute the parallel FFT with this two-dimensional decomposition.
+Note that the 3d to 2d remap implies some very special restrictions on
+the block sizes for :math:`n_0` and :math:`n_1`, i.e., the blocks must
+be divisible by :math:`Q_0` and :math:`Q_1`. More precisely, the default
+blocks of the 2d-decomposition are given by ``n0/(P0*Q0)`` and
+``n1/(P1*Q1)`` (both divisions rounded down). This implies that the
+default blocks of the 3d-decomposition must be ``n0/(P0*Q0) * Q0``,
+``n1/(P1*Q1) * Q1``, and ``n2/(Q0*Q1)`` (all divisions rounded down).
+
+Planning effort
+---------------
+
+Pass one of the following flags
+
+``PFFT_ESTIMATE``,
+
+``PFFT_MEASURE``,
+
+``PFFT_PATIENT``, or,
+
+``PFFT_EXHAUSTIVE``
+
+to the PFFT planner in order to plan all internal FFTW plans with
+``FFTW_ESTIMATE``, ``FFTW_MEASURE``, ``FFTW_PATIENT``, or
+``FFTW_EXHAUSTIVE``, respectively. The default value is ``PFFT_MEASURE``.
+
+PFFT uses FFTW plans for parallel array transposition and the serial
+transforms. In fact, every serial transform is a combination of strided
+lower-dimensional FFTs and a serial array transposition (necessary to
+prepare the global transposition) which can be done by a single FFTW
+plan. However, it turns out that FFTW sometimes performs better if the
+serial transposition and the strided FFTs are executed separately.
+Therefore, PFFT introduces the flag ``PFFT_TUNE`` that enables extensive
+run time tests in order to find the optimal sequence of serial strided
+FFT and serial transposition for every serial transform. These tests are
+disabled by default, which corresponds to the flag ``PFFT_NO_TUNE``.
+
+Preserving input data
+---------------------
+
+The following flags
+
+``PFFT_PRESERVE_INPUT``,
+
+``PFFT_DESTROY_INPUT``, and,
+
+``PFFT_BUFFERED_INPLACE``
+
+only take effect for out-of-place transforms. The first one behaves
+analogously to the FFTW flag ``FFTW_PRESERVE_INPUT`` and ensures that
+the input values are not overwritten. In fact, this flag implies that
+only the first serial transform is executed out-of-place and all
+successive steps are performed in-place on the output array. In
+compliance to FFTW, this is the default behaviour for out-of-place
+plans.
+
+The second flag behaves analogously to the FFTW flag
+``FFTW_DESTROY_INPUT`` and tells the planner that the input array can be
+used as scratch array. This may give some speedup for out-of-place
+plans, because all the intermediate transforms and transposition steps
+can be performed out-of-place.
+
+Finally, the flag ``PFFT_BUFFERED_INPLACE`` can be used for out-of-place
+plans that store their inputs and outputs in the same array, i.e., array
+``out`` is used for intermediate out-of-place transforms and
+transpositions but the PFFT inputs and outputs are stored in array
+``in``.
+
+FFTs with shifted index sets
+----------------------------
+
+``PFFT_SHIFTED_IN``
+
+``PFFT_SHIFTED_OUT``
+
+Pruned FFT and Shifted Index Sets
+---------------------------------
+
+Pruned FFT
+~~~~~~~~~~
+
+Pruned r2r- and c2c-FFTs are defined as
+
+.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=0,\dots,n_o-1,
+
+where :math:`n_i\le n` and :math:`n_o\le n`.
+
+Shifted Index Sets
+~~~~~~~~~~~~~~~~~~
+
+For :math:`N\in 2{{\mathbb{N}}}` we define the FFT with shifted inputs
+
+For :math:`K,L,N\in 2{{\mathbb{N}}}`, :math:`K<N`, :math:`L<N` we define
+
+Precisions
+----------
+
+PFFT handles multiple precisions exactly in the same way as FFTW.
+Therefore, we quote part of the FFTW manual in the context of PFFT:
+
+You can install single and long-double precision versions of PFFT, which
+replace double with float and long double, respectively; see
+[sec:install]. To use these interfaces, you must
+
+Link to the single/long-double libraries; on Unix, ``-lpfftf`` or
+``-lpfftl`` instead of (or in addition to) ``-lpfft``. (You can link to
+the different-precision libraries simultaneously.)
+
+Include the same ``<pfft.h>`` header file.
+
+Replace all lowercase instances of ‘``pfft_``’ with ‘``pfftf_``’ or
+‘``pfftl_``’ for single or long-double precision, respectively.
+(``pfft_complex`` becomes ``pfftf_complex``, ``pfft_execute`` becomes
+``pfftf_execute``, etcetera.)
+
+Uppercase names, i.e. names beginning with ‘``PFFT_``’, remain the same.
+
+Replace ``double`` with ``float`` or ``long double`` for subroutine
+parameters.
+
+Ghost cell communication
+------------------------
+
+Fortran interface
+-----------------
+