From eaabacd752d7415c6b19d54870b960651077f72a Mon Sep 17 00:00:00 2001
From: Yu Feng <yfeng1@waterfall.dyn.berkeley.edu>
Date: Sun, 13 Sep 2015 01:21:57 -0700
Subject: [PATCH 1/6] Initially setup sphinx build.

---
 doc/Makefile-sphinx | 192 +++++++++++++++++++++++++++++
 doc/conf.py         | 287 ++++++++++++++++++++++++++++++++++++++++++++
 doc/index.rst       |  22 ++++
 doc/make.bat        | 263 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 764 insertions(+)
 create mode 100644 doc/Makefile-sphinx
 create mode 100644 doc/conf.py
 create mode 100644 doc/index.rst
 create mode 100644 doc/make.bat
diff --git a/doc/Makefile-sphinx b/doc/Makefile-sphinx
new file mode 100644
index 0000000..33668b7
--- /dev/null
+++ b/doc/Makefile-sphinx
@@ -0,0 +1,192 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp applehelp devhelp epub latex latexpdf latexpdfja text man texinfo info gettext changes linkcheck doctest coverage xml pseudoxml
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  applehelp  to make an Apple Help Book"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+	@echo "  coverage   to run coverage check of the documentation (if enabled)"
+
+clean:
+	rm -rf $(if $(strip $(BUILDDIR)),$(BUILDDIR),$(error BUILDDIR is empty; refusing to run rm -rf /*))/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PFFT.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PFFT.qhc"
+
+applehelp:
+	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+	@echo
+	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+	@echo "N.B. You won't be able to view it unless you put it in" \
+	      "~/Library/Documentation/Help or install it in your application" \
+	      "bundle."
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/PFFT"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PFFT"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through platex and dvipdfmx..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+	@echo "platex and dvipdfmx finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	$(MAKE) -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+coverage:
+	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
+	@echo "Testing of coverage in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/coverage/python.txt."
+
+xml:
+	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+	@echo
+	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+	@echo
+	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..7d0dd0d
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,287 @@
+# -*- coding: utf-8 -*-
+#
+# PFFT documentation build configuration file, created by
+# sphinx-quickstart on Sun Sep 13 01:20:34 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+import shlex
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.todo',
+    'sphinx.ext.mathjax',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'PFFT'
+copyright = u'2015, Michael Pippig'
+author = u'Michael Pippig'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '1.0.8'
+# The full version, including alpha/beta/rc tags.
+release = '1.0.8'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = True
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'alabaster'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
+#html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# Now only 'ja' uses this config value
+#html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'PFFTdoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+
+# Latex figure (float) alignment
+#'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  (master_doc, 'PFFT.tex', u'PFFT Documentation',
+   u'Michael Pippig', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'pfft', u'PFFT Documentation',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  (master_doc, 'PFFT', u'PFFT Documentation',
+   author, 'PFFT', 'One line description of project.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..705487a
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,22 @@
+.. PFFT documentation master file, created by
+   sphinx-quickstart on Sun Sep 13 01:20:34 2015.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to PFFT's documentation!
+================================
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 0000000..d41f63f
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,263 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+	:help
+	echo.Please use `make ^<target^>` where ^<target^> is one of
+	echo.  html       to make standalone HTML files
+	echo.  dirhtml    to make HTML files named index.html in directories
+	echo.  singlehtml to make a single large HTML file
+	echo.  pickle     to make pickle files
+	echo.  json       to make JSON files
+	echo.  htmlhelp   to make HTML files and a HTML help project
+	echo.  qthelp     to make HTML files and a qthelp project
+	echo.  devhelp    to make HTML files and a Devhelp project
+	echo.  epub       to make an epub
+	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+	echo.  text       to make text files
+	echo.  man        to make manual pages
+	echo.  texinfo    to make Texinfo files
+	echo.  gettext    to make PO message catalogs
+	echo.  changes    to make an overview over all changed/added/deprecated items
+	echo.  xml        to make Docutils-native XML files
+	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
+	echo.  linkcheck  to check all external links for integrity
+	echo.  doctest    to run all doctests embedded in the documentation if enabled
+	echo.  coverage   to run coverage check of the documentation if enabled
+	goto end
+)
+
+if "%1" == "clean" (
+	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+	del /q /s %BUILDDIR%\*
+	goto end
+)
+
+
+REM Check if sphinx-build is available and fallback to Python version if any
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 goto sphinx_python
+goto sphinx_ok
+
+:sphinx_python
+
+set SPHINXBUILD=python -m sphinx.__init__
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+:sphinx_ok
+
+
+if "%1" == "html" (
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+	goto end
+)
+
+if "%1" == "dirhtml" (
+	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+	goto end
+)
+
+if "%1" == "singlehtml" (
+	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+	goto end
+)
+
+if "%1" == "pickle" (
+	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can process the pickle files.
+	goto end
+)
+
+if "%1" == "json" (
+	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can process the JSON files.
+	goto end
+)
+
+if "%1" == "htmlhelp" (
+	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+	goto end
+)
+
+if "%1" == "qthelp" (
+	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\PFFT.qhcp
+	echo.To view the help file:
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\PFFT.qhc
+	goto end
+)
+
+if "%1" == "devhelp" (
+	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished.
+	goto end
+)
+
+if "%1" == "epub" (
+	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The epub file is in %BUILDDIR%/epub.
+	goto end
+)
+
+if "%1" == "latex" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "latexpdf" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	cd %BUILDDIR%/latex
+	make all-pdf
+	cd %~dp0
+	echo.
+	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "latexpdfja" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	cd %BUILDDIR%/latex
+	make all-pdf-ja
+	cd %~dp0
+	echo.
+	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "text" (
+	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The text files are in %BUILDDIR%/text.
+	goto end
+)
+
+if "%1" == "man" (
+	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The manual pages are in %BUILDDIR%/man.
+	goto end
+)
+
+if "%1" == "texinfo" (
+	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+	goto end
+)
+
+if "%1" == "gettext" (
+	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+	goto end
+)
+
+if "%1" == "changes" (
+	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.The overview file is in %BUILDDIR%/changes.
+	goto end
+)
+
+if "%1" == "linkcheck" (
+	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+	goto end
+)
+
+if "%1" == "doctest" (
+	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+	goto end
+)
+
+if "%1" == "coverage" (
+	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Testing of coverage in the sources finished, look at the ^
+results in %BUILDDIR%/coverage/python.txt.
+	goto end
+)
+
+if "%1" == "xml" (
+	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The XML files are in %BUILDDIR%/xml.
+	goto end
+)
+
+if "%1" == "pseudoxml" (
+	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
+	goto end
+)
+
+:end

From b3704cf6ab44bd0235017575b664f52ef20de9bf Mon Sep 17 00:00:00 2001
From: Yu Feng <rainwoodman@gmail.com>
Date: Sun, 13 Sep 2015 01:36:06 -0700
Subject: [PATCH 2/6] Import rst files.

---
 doc/conf.py       |   2 +-
 doc/develop.rst   |  42 +++++
 doc/features.rst  | 161 +++++++++++++++++
 doc/fortran.rst   |   3 +
 doc/index.rst     |   8 +
 doc/install.rst   | 103 +++++++++++
 doc/interface.rst |  95 ++++++++++
 doc/intro.rst     |  94 ++++++++++
 doc/tutorial.rst  | 437 ++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 944 insertions(+), 1 deletion(-)
 create mode 100644 doc/develop.rst
 create mode 100644 doc/features.rst
 create mode 100644 doc/fortran.rst
 create mode 100644 doc/install.rst
 create mode 100644 doc/interface.rst
 create mode 100644 doc/intro.rst
 create mode 100644 doc/tutorial.rst

diff --git a/doc/conf.py b/doc/conf.py
index 7d0dd0d..acaf658 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -111,7 +111,7 @@
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_theme = 'alabaster'
+html_theme = 'nature'
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
diff --git a/doc/develop.rst b/doc/develop.rst
new file mode 100644
index 0000000..a728ce8
--- /dev/null
+++ b/doc/develop.rst
@@ -0,0 +1,42 @@
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Developers Guide}\label{chap:develop}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+
+
+\section{Search and replace patterns}
+
+Correct alignment of pfft.h header
+\begin{lstlisting}
+%s/^\(    [^ ]\+[^\\]*\)  \\/  \1\\/g  
+\end{lstlisting}
+
+Expand most macros of pfft.h to generate the function reference of this manual:
+\begin{lstlisting}[language=bash,prebreak=\textbackslash,]
+sed -e 's/ *\\$//g' -e 's/PFFT_EXTERN //g' \
+    -e 's/PX(\([^)]*\))/pfft_\1/g' -e 's/ INT/ ptrdiff_t/g' \
+    -e 's/ R/ double/g' -e 's/ C/ pfft_complex/g' \
+    -e 's/^  //g' pfft.h > pfft.h.expanded
+\end{lstlisting}
+
+
+
+
+
+
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{ToDo}\label{chap:todo}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\begin{itemize}
+  \item \code{PFFT_FORWARD} is defined as \code{FFTW_FORWARD}
+  \item \code{FFTW_FORWARD} is defined as $-1$
+  \item PFFT allows one to choose between \code{FFTW_FORWARD} and \code{FFTW_BACKWARD}, which is not implemented by FFTW.
+  \item Matlab uses the same sign convention, i.e., $-1$ for \code{fft} and $+1$ for \code{ifftn}
+\end{itemize}
+
+\section{Measuring parallel run times}
+Use \code{MPI_Barrier} in front of every call to \code{pfft_} function to avoid unbalanced run times.
+
diff --git a/doc/features.rst b/doc/features.rst
new file mode 100644
index 0000000..13c48a1
--- /dev/null
+++ b/doc/features.rst
@@ -0,0 +1,161 @@
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Advanced Features}\label{chap:feat}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+
+%------------------------------------------------------------------------------
+\section{How to Deal with FFT Index Shifts in Parallel}
+%------------------------------------------------------------------------------
+Let $n\in2\N$. A common problem is that the index of the FFT input and/or output array runs between $-\nicefrac n2,\hdots,\nicefrac n2-1$,
+but the FFT library requires them to run between $0,\hdots,n-1$. With serial program execution one can easily remap the input data $\hat g_k$
+in a way that is suitable for the library, i.e.,
+\begin{equation*}
+  \hat f_k := \hat g_{(k-\nicefrac n2\bmod n)}, \quad k = 0,\hdots,n-1.
+\end{equation*}
+Similarly, one could remap the outputs of the library $f_l$, $l=0,\cdots,n-1$ in the opposite direction in order to get the
+required outputs, i.e.,
+\begin{equation*}
+  g_l := f_{l \bmod n}, \quad l = -\nicefrac n2,\hdots,\nicefrac n2-1.
+\end{equation*}
+These shifts are also known as \code{fftshift} in Matlab.
+
+However, with distributed memory these \code{fftshift} operations require more complex data movements and result in a global communication.
+For example, the first index of the array moves to the middle and, therefore, the corresponding data move to another MPI process.
+Fortunately, this communication can be avoided at the cost of little extra computation.
+At the end of the section we present two PFFT library functions that perform the necessary pre- and postprocessing
+for shifted input and output index sets.
+
+\subsection{Shift with half the FFT size}
+
+The special case of input shift $k_s=-\nicefrac n2$ and/or output shift $l_s=-\nicefrac n2$ is supported by PFFT.
+The user can choose to shift the input (\verb+PFFT_SHIFTED_IN+) and/or to shift the output (\verb+PFFT_SHIFTED_OUT+).
+\todo{this flag can be used for \code{local_size} and planning}
+
+Here, we are interested in the computation of
+\begin{equation*}
+  g_l = \sum_{k=-\nicefrac{n_i}{2}}^{\nicefrac{n_i}{2}-1} \hat g_k \eim{kl/n}, \quad l=-\nicefrac{n_o}{2},\hdots,\nicefrac{n_o}{2}-1  
+\end{equation*}
+with $n, n_i, n_o \in 2\N$ and $n>n_i$, $n>n_o$.
+
+With an index shift of $\nicefrac n2$ both in $k$ and $l$ this is equivalent to the computation of
+\begin{align*}
+  g_{(l-\nicefrac{n}{2})}
+  &= \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
+     \hat g_{(k-\nicefrac{n}{2})} \eim{(k-\nicefrac n2)(l-\nicefrac n2)/n} \\
+  &= \e^{+\pi\ti l} 
+       \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
+       \left(\hat g_{(k-\nicefrac{n}{2})}\e^{+\pi\ti (k-\nicefrac n2)}\right) \eim{kl/n} \\
+  &= \e^{+\pi\ti(l-\nicefrac n2)} 
+     \underbrace{
+       \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
+       \underbrace{\left(\hat g_{(k-\nicefrac{n}{2})}\e^{+\pi\ti k}\right)}_{\hat f_k} \eim{kl/n}
+     }_{f_l}
+\end{align*}
+for $ l=\nicefrac n2-\nicefrac{n_o}{2},\hdots,\nicefrac n2 +\nicefrac{n_o}{2}-1$.
+Therefore, we get the following algorithm
+
+\begin{equation*}
+  f_l = \sum_{k=0}^n \hat g_k \eim{kl/n}, \quad l=-\nicefrac{n_o}{2},\hdots,\nicefrac{n_o}{2}-1  
+\end{equation*}
+
+The special case $k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}$ corresponds to shifting the arrays (\textsf{FFTSHIFT})
+\begin{algorithm}
+  \begin{algorithmic}[1]
+    \itemsep=1.1ex
+    \State For $k=0,\hdots,n-1$ set $\hat f_k = 0$.
+    \State For $k=-\nicefrac{n_i}{2},\hdots,\nicefrac{n_i}{2}-1$ compute $\hat f_{(k+\nicefrac{n}{2})} = (-1)^{(k+\nicefrac{n}{2})} \hat g_{k}$.
+    \State For $l=0,\hdots,n-1$ compute $f_l = \sum_{k=0}^{n-1} \hat f_k \eim{kl/n}$ using PFFT.
+    \State For $l=-\nicefrac{n_o}{2},\hdots,\nicefrac{n_o}{2}-1$ compute $g_l = (-1)^l f_{(l+n/2)} $.
+  \end{algorithmic}
+\end{algorithm}
+
+
+Note, that this shift implies that the library deals with pruned FFTs in a special way, i.e., half of the zeros are added
+at the beginning of the inputs and the other half is added at the end.
+
+
+
+
+
+
+\subsection{Arbitrary shifts}
+More general shifts must be done by the user.
+
+
+In a more general setting, we are interested in the computation of FFTs with shifted index sets, i.e., assume $k_s,l_s\in\Z$ and compute
+\begin{equation*}
+  g_l = \sum_{k=k_s}^{n_i+k_s-1} \hat g_k \eim{kl/n},
+  \quad l=l_s,\hdots,n_o+l_s-1\,.
+\end{equation*}
+Because of the periodicity of the FFT this can be easily performed by \algname~\ref{alg:fftshift_translation}.
+\begin{algorithm}\label{alg:fftshift_translation}
+  \begin{algorithmic}[1]
+    \itemsep=1.1ex
+    \State For $k=0,\hdots,n_i-1$ assign $\hat f_k = \hat g_{(k+k_s\bmod n_i)}$.
+    \State For $l=0,\hdots,n_o-1$ compute $f_l = \sum_{k=0}^{n_i-1} \hat f_k \eim{kl/n}$ using PFFT.
+    \State For $l=0,\hdots,n_o-1$ assign $g_l = f_{(l-l_s\bmod n_o)}$.
+  \end{algorithmic}
+  \caption{Shifted FFT with explicit data movement.}
+\end{algorithm}
+However, this involves explicit data movement since the sequence of data changes.
+For our parallel data decomposition the change of data layout requires data communication.
+A simple index shift results in the computation of
+\begin{align*}
+  g_{l+l_s}
+  &=
+    \sum_{k=k_s}^{n_i+k_s-1} \hat g_k \eim{k(l+l_s)/n}
+    =
+    \sum_{k=0}^{n_i-1} \hat g_{k+k_s} \eim{(k+k_s)(l+l_s)/n} \\
+  &=
+    \eim{k_sl/n} \sum_{k=0}^{n_i-1} \underbrace{\left(\hat g_{k+k_s}\eim{(k+k_s)l_s/n}\right)}_{=: \hat f_k} \eim{kl/n}
+\end{align*}
+for all $l=0,\hdots,n_o-1$. The resulting \algname~\ref{alg:fftshift_modulation} preserves the sequence of
+data at the price of some extra computation.
+\begin{algorithm}\label{alg:fftshift_modulation}
+  \begin{algorithmic}[1]
+    \itemsep=1.1ex
+    \State For $k=0,\hdots,n_i-1$ compute $\hat f_k = \hat g_{(k+k_s)} \eim{(k+k_s)l_s/n}$.
+    \State For $l=0,\hdots,n_o-1$ compute $f_l = \sum_{k=0}^{n_i-1} \hat f_k \eim{kl/n}$ using PFFT.
+    \State For $l=0,\hdots,n_o-1$ compute $g_{(l+l_s)} = f_l \eim{k_sl/n}$.
+  \end{algorithmic}
+  \caption{Shifted FFT without explicit data movement.}
+\end{algorithm}
+
+The special case $k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}$ corresponds to shifting the arrays (\textsf{FFTSHIFT})
+\begin{algorithm}
+  \begin{algorithmic}[1]
+    \itemsep=1.1ex
+    \State For $k=0,\hdots,n_i-1$ compute $\hat f_k = \hat g_{(k-\nicefrac{n_i}{2})} \e^{+\pi\ti (k-\nicefrac{n_i}{2})n_o/n}$.
+    \State For $l=0,\hdots,n_o-1$ compute $f_l = \sum_{k=0}^{n_i-1} \hat f_k \eim{kl/n}$ using PFFT.
+    \State For $l=0,\hdots,n_o-1$ compute $g_{(l-\nicefrac{n_o}{2})} = f_l \e^{+\pi\ti n_i l/n}$.
+  \end{algorithmic}
+\end{algorithm}
+
+
+
+
+%------------------------------------------------------------------------------
+\section{Parallel pruned FFT}
+%------------------------------------------------------------------------------
+Within PFFT we define a pruned FFT as
+\begin{equation*}
+  g_l = \sum_{k=0}^{n_i-1} \hat g_{k} \eim{kl/n}, \quad l=0,\hdots,n_o-1.
+\end{equation*}
+Formally, this is equivalent to the following regular size $n$ FFT
+\begin{equation*}
+  f_l = \sum_{k=0}^{n-1} \hat f_{k} \eim{kl/n}, \quad l=0,\hdots,n-1,
+\end{equation*}
+with 
+\begin{equation*}
+  \hat f_k := 
+  \begin{cases}
+  \hat g_k, &: k=0,\hdots,n_i-1, \\
+  0         &: k=n_i,\hdots,n-1,    
+  \end{cases}
+\end{equation*}
+and $g_l := f_l$, $l=0,\hdots,n_o-1$. I.e., we add $n-n_i$ zeros at the end of the input array and throw away $n-n_o$ entries at the end of the output array.
+
+
+The definition of pruned FFT changes for \code{PFFT_SHIFTED_IN} and \code{PFFT_SHIFTED_OUT}.
+
+
diff --git a/doc/fortran.rst b/doc/fortran.rst
new file mode 100644
index 0000000..0f2d792
--- /dev/null
+++ b/doc/fortran.rst
@@ -0,0 +1,3 @@
+\chapter{Fortran Interface}
+
+based on Fortran 90
\ No newline at end of file
diff --git a/doc/index.rst b/doc/index.rst
index 705487a..4a5cbab 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -11,6 +11,14 @@ Contents:
 .. toctree::
    :maxdepth: 2
 
+   intro
+   tutorial
+   install
+   features
+   interface
+   reference
+   develop
+
 
 
 Indices and tables
diff --git a/doc/install.rst b/doc/install.rst
new file mode 100644
index 0000000..c8b96bf
--- /dev/null
+++ b/doc/install.rst
@@ -0,0 +1,103 @@
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Installation and linking}\label{chap:inst}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+The install of PFFT is based on the Autotools and follows the typical workflow
+\begin{lstlisting}[escapechar=§]
+./configure
+make
+make install
+\end{lstlisting}
+
+
+\section{Install of the latest official FFTW release}\label{sec:fftw_inst}
+PFFT depends on Release~\fftwversion{} of the FFTW library~\cite{fftw}.
+For the sake of completeness, we show the command line based install procedure in the following.
+However, note that we provide install scripts on \websoft that simplify the install a lot.
+We highly recommend to use these install scripts, since they additionally apply several
+performance patches and bugfixes that have been submitted to the FFTW developers but
+are not yet included in the official FFTW releases.
+\begin{lstlisting}[escapechar=§]
+wget http://www.fftw.org/fftw-§\fftwversionsl§.tar.gz
+tar xzvf fftw-§\fftwversion§.tar.gz
+cd fftw-§\fftwversion§
+./configure --enable-mpi --prefix=$HOME/local/fftw3_mpi §\label{lst:fftw:conf}§
+make
+make install
+\end{lstlisting}
+The MPI algorithms of FFTW must be built with an MPI C compiler. Add the statement \code{MPICC=\$MPICCOMP}
+at the end of line~\ref{lst:fftw:conf} if the \code{configure} script fails to determine the right
+MPI C compiler \code{\$MPICCOMP}. Similarly, the MPI Fortran compiler \code{\$MPIFCOMP} is set by \code{MPIFC=\$MPIFCOMP}.
+
+\section{Install of the PFFT library}\label{sec:pfft-inst}
+In the simplest case, the hardware platform and the \fftw-\fftwversion{} library are
+recognized by the PFFT configure script automatically, so all we have to do is
+\begin{lstlisting}[escapechar=§]
+wget http://www.tu-chemnitz.de/~mpip/software/pfft-§\pfftversionsl§.tar.gz
+tar xzvf pfft-§\pfftversion§.tar.gz
+cd pfft-§\pfftversion§
+./configure
+make
+make check
+make install
+\end{lstlisting}
+Hereby, the optional call \code{make check} builds the test programs.
+If the \fftw-\fftwversion{} software library is already installed on your system but not found by the configure script,
+you can provide the FFTW installation directory \code{\$FFTWDIR} to configure by
+\begin{lstlisting}[language=bash]
+./configure --with-fftw3=$FFTWDIR
+\end{lstlisting}
+This call implies that the FFTW header files are located in \code{\$FFTWDIR/include} and the FFTW library files are located
+in \code{\$FFTWDIR/lib}. Otherwise, one should specify the FFTW include path \code{\$FFTWINC} and the FFTW library path
+\code{\$FFTWLIB} separately by
+\begin{lstlisting}[prebreak = {\textbackslash}]
+./configure --with-fftw3-includedir=$FFTWINC --with-fftw3-libdir=$FFTWLIB
+\end{lstlisting}
+At the end, this is equivalent to
+\begin{lstlisting}[prebreak = {\textbackslash}]
+./configure CPPFLAGS=-I$FFTWINC LDFLAGS=-L$FFTWLIB
+\end{lstlisting}
+which is more common to experienced users of the Autotools.
+To install PFFT in a user specified directory \code{\$PFFTINSTDIR} call configure with the option
+\begin{lstlisting}
+./configure --prefix=$PFFTINSTDIR
+\end{lstlisting}
+However, this option is mandatory whenever you do not have root permissions on your machine, since the default install paths of 
+\code{configure} are not accessible by standard users.
+The PFFT library must be built with an MPI compiler. In Section~\ref{sec:fftw_inst} we already described how to hand the right compilers to the \code{configure} script.
+Some more options are
+\begin{compactitem}
+  \item \code[keywords=]{--enable-float}: Produces a single-precision version of PFFT (float) instead of the default double-precision (double); see \ref{sec:prec}.
+  \item \code[keywords=]{--enable-long-double}: Produces a long-double precision version of PFFT (long double) instead of the default double-precision (double); see \ref{sec:prec}.
+  \item \code{--disable-fortran}: Disables inclusion of Fortran wrapper routines in the standard PFFT libraries.
+  \item \code{--disable-tests}: Disables build of test programs.
+\end{compactitem}
+For more details on the options of the \code{configure} script call
+\begin{lstlisting}
+./configure --help
+\end{lstlisting}
+
+
+\section{How to include PFFT in your program}
+All programs using PFFT should include its header file
+\begin{lstlisting}
+#include <pfft.h>
+\end{lstlisting}
+This header includes the FFTW headers \code{fftw3.h}, \code{fftw3-mpi.h} automatically. Make sure that the compiler can find them by setting
+the include flags appropriately.
+You must also link to the PFFT, FFTW and FFTW-MPI libraries. On Unix, this means adding \code{-lpfft -lfftw3_mpi -lfftw3 -lm} at the end of the link command.
+For example, to build \code{pfft_test.c} use the following compiler invocation
+\begin{lstlisting}[prebreak = {\textbackslash}]
+mpicc pfft_test.c -I$PFFTINC -I$FFTWINC -L$PFFTLIB -L$FFTWLIB -lpfft -lfftw3_mpi -lfftw3 -lm
+\end{lstlisting}
+Substitute \code{mpicc} by any other MPI C compiler if you like.
+\code{\$PFFTINC}, \code{\$FFTWINC}, \code{\$PFFTLIB}, and \code{\$FFTWLIB} denote the PFFT and FFTW include and library paths, respectively.
+If you use the install scripts mentioned in Sect.~\ref{sec:pfft-inst}, these paths will be
+\begin{lstlisting}[escapechar=§,numbers=none]
+PFFTINC = $HOME/local/pfft-§\pfftversion§/include
+FFTWINC = $HOME/local/fftw-§\fftwversion§/include
+PFFTLIB = $HOME/local/pfft-§\pfftversion§/lib
+FFTWLIB = $HOME/local/fftw-§\fftwversion§/lib
+\end{lstlisting}
+
+
diff --git a/doc/interface.rst b/doc/interface.rst
new file mode 100644
index 0000000..3bbfb58
--- /dev/null
+++ b/doc/interface.rst
@@ -0,0 +1,95 @@
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Interface Layers of the PFFT Library}\label{chap:api}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+We give a quick overview of the PFFT interface layers in the order of increasing flexibility at the example of c2c-FFTs.
+For r2c-, c2r-, and r2r-FFT similar interface layer specifications apply. A full reference list of all PFFT functions is given in Chapter~\ref{chap:ref}. 
+\section{Basic Interface}
+The \code{_3d} interface is the simplest interface layer. It is suitable for the planning of three-dimensional FFTs.
+\begin{lstlisting}
+ptrdiff_t pfft_local_size_dft_3d(
+    const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags,
+    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+void pfft_local_block_dft_3d(
+    const ptrdiff_t *n, MPI_Comm comm_cart,
+    int pid, unsigned pfft_flags,
+    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+pfft_plan pfft_plan_dft_3d(
+    const ptrdiff_t *n,
+    pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+    int sign, unsigned pfft_flags);
+\end{lstlisting}
+Hereby, \code{n}, \code{local_ni}, \code{local_i_start}, \code{local_no}, and \code{local_o_start} are
+\code{ptrdiff_t} arrays of length \code{3}.
+
+The basic interface generalizes the \code{_3d} interface to FFTs of arbitrary dimension \code{rnk_n}.
+\begin{lstlisting}
+ptrdiff_t pfft_local_size_dft(
+    int rnk_n, const ptrdiff_t *n,
+    MPI_Comm comm_cart, unsigned pfft_flags,
+    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+void pfft_local_block_dft(
+    int rnk_n, const ptrdiff_t *n,
+    MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+pfft_plan pfft_plan_dft(
+    int rnk_n, const ptrdiff_t *n,
+    pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+    int sign, unsigned pfft_flags);
+\end{lstlisting}
+Therefore, \code{n}, \code{local_ni}, \code{local_i_start}, \code{local_no}, and \code{local_o_start} become
+arrays of length \code{rnk_n}.
+
+\section{Advanced Interface}
+The advanced interface introduces the arrays \code{ni} and \code{no} of length \code{rnk_n}
+that give the pruned FFT input and output size.
+Furthermore, the arrays \code{iblock} and \code{oblock} of length \code{rnk_pm} (\code{rnk_pm} being the dimension of the process mesh)
+serve to adjust the block size of the input and output block decomposition.
+The additional parameter \code{howmany} gives the number of transforms that will be computed simultaneously.
+\begin{lstlisting}
+ptrdiff_t pfft_local_size_many_dft(
+    int rnk_n, const ptrdiff_t *n,
+    const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany,
+    const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+    MPI_Comm comm_cart, unsigned pfft_flags,
+    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+void pfft_local_block_many_dft(
+    int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no,
+    const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+    MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+pfft_plan pfft_plan_many_dft(
+    int rnk_n, const ptrdiff_t *n,
+    const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany,
+    const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+    pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+    int sign, unsigned pfft_flags);
+\end{lstlisting}
+
+
+\section{Preliminary: Skip Serial Transformations}\label{sec:skip-trafo}
+The \code{_skipped} interface extends the \code{_many} interface by adding the possibility to skip some of the serial FFTs.
+\begin{lstlisting}
+pfft_plan pfft_plan_many_dft_skipped(
+    int rnk_n, const ptrdiff_t *n,
+    const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany,
+    const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+    (red@const int *skip_trafos,@*)
+    pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+    int sign, unsigned pfft_flags);
+\end{lstlisting}
+Hereby, \code{skip_trafos} is an \code{int} array of length \code{rnk_pm+1} (\code{rnk_pm} being the mesh dimension of the communicator \code{comm_cart}).
+For \code{t=0,...,rnk_pm} set \code{skip_trafos[t]=1} if the \code{t}-th serial transformation should be computed, otherwise set \code{skip_trafos[t]=0}.
+Note that the local transpositions are always performed, since they are a prerequisite for the global communication to work.
+At the moment it is only possible to skip the whole serial transform along the last \code{rnk_n-rnk_pm-1} dimensions.
+However, this behaviour can be realized by a call of a \code{(rnk_pm+1)}-dimensional PFFT with
+\begin{lstlisting}
+for(int t=rnk_pm+1; t<rnk_n; t++)
+  howmany *= n[t];
+\end{lstlisting}
+and manual computation of the desired serial transforms along the last \code{rnk_n-rnk_pm-1} dimensions.
diff --git a/doc/intro.rst b/doc/intro.rst
new file mode 100644
index 0000000..ec7ae97
--- /dev/null
+++ b/doc/intro.rst
@@ -0,0 +1,94 @@
+\abstract{
+This user manual describes the usage of PFFT~\pfftversion~\cite{pfft,Pi13}, a MPI-based, parallel software library for the
+computation of equispaced fast Fourier transforms (FFT) on parallel, distributed memory architectures.
+The reader of this manual should be familiar with the basic usage of FFTW and MPI.
+For further information we refer to the well written FFTW user manual~\cite{fftw-manual} and
+the MPI Standard~\cite{MPI-2.2}, see also \cite{GrLuTh99} for detailed explanations.}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Introduction}\label{chap:intro}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+A popular software library for computing FFTs is FFTW~\cite{fftw, FFTW05}. This library also includes a parallel FFT implementation (FFTW-MPI) based on the Message Passing Interface (MPI).
+FFTW-MPI parallelizes multi-dimensional FFTs by a mixture of serial lower-dimensional FFTs and parallel data transpositions.
+However, FFTW-MPI makes use of a one-dimensional data decomposition, which shows to be a scalability bottleneck on large scale, parallel computers.
+For example, a three-dimensional FFT of size $1024^3$ can be computed with at most $1024$ MPI processes.
+In contrast, using a two-dimensional data decomposition would increase the maximum number of MPI processes to $1024^2$ in this case.
+
+The main goal of PFFT is to extend the MPI part of the FFTW software library to multi-dimensional data decompositions,
+i.e., $d$-dimensional FFTs of size $N^d$ can be computed in parallel with at most $N^{d-1}$ MPI processes.
+In addition, PFFT offers several extra features that are particularly useful for parallel, distributed memory FFTs but are not yet present in FFTW-MPI.
+We refer to the publication~\cite{Pi13} for a closer look on the different data decompositions and the underlying algorithms of the PFFT library.
+
+The interface of PFFT is as close as possible to the FFTW-MPI interface. 
+In fact, we consider every difference between PFFT and FFTW that is not explicitly mentioned within this manual as a bug that should be reported to \webpfft.
+Therefore, porting code that uses FFTW-MPI to PFFT is almost trivial, e.g. see Section~\ref{sec:porting}.
+
+Most features of PFFT are inherited from FFTW or similarly implemented. These include the following:
+\begin{compactitem}
+  \item We employ fast $\mathcal{O}(N\log N)$ algorithms of FFTW to compute arbitrary-size
+        discrete Fourier transforms of complex data, real data, and even- or odd-symmetric real data.
+  \item The dimension of the FFT can be arbitrary. However, parallel data decomposition must be at least one dimension smaller.
+  \item PFFT offers portable performance; e.g., it will perform well on most platforms.
+  \item The application of PFFT is split into a time consuming planning step and a high performance execution step.
+  \item Installing the library is easy. It is based on the common sequence of configure, make, and make install.
+  \item The interface of PFFT is very close to the MPI interface of FFTW.
+        In fact, we tried to add as few extra parameters as possible.
+  \item PFFT is written in C but also offers a Fortran interface, see Section~\ref{sec:fortran}.
+  \item FFTW includes shared memory parallelism for all serial transforms. This enables us to benefit from hybrid parallelism to a certain amount, see Section~\ref{sec:openmp}.
+  \item All steps of our parallel FFT can be performed completely in place. This is especially remarkable for the global
+        transposition routines.
+  \item Conforming to good MPI programming practice, all PFFT transforms can be performed on user defined communicators.
+        In other words, PFFT does not force the user to work with \verb+MPI_COMM_WORLD+.
+  \item PFFT uses the same algorithm to compute the size of the local array blocks as FFTW. This implies that the FFT size need not
+        be divisible by the number of processes.
+  \item PFFT supports single, double and long double precision.
+  \item PFFT supports new-array execution, i.e., a PFFT plan can be planned and executed on different arrays up to some restrictions, see Section~\ref{sec:new-array} for details.
+        Thanks to Yu Feng for the new-array execute patch.
+\end{compactitem}
+Furthermore, we added some special features to support repeated tasks that often occur in practical application of parallel FFTs.
+\begin{compactitem}
+  \item PFFT includes a very flexible ghost cell exchange module. A detailed description of this module is given in Section~\ref{sec:gc}.
+  \item PFFT accepts three-dimensional data decomposition even for three-dimen\-sional FFTs.
+        However, the underlying parallel FFT framework is still based on two-dimensional decomposition. A more detailed description can be found
+        in Section~\ref{sec:3don2d}.
+  \item PFFT explicitly supports the parallel calculation of pruned FFTs. Details are given in Section~\ref{sec:pruned}.
+\end{compactitem}
+
+Finally, we complete this overview with a list of features that are (not yet) implemented in PFFT.
+\begin{compactitem}
+  \item Parallel one-dimensional FFT based on MPI. FFTW-MPI uses another parallelization strategy for one-dimensional FFTs, which is not implemented in PFFT.
+        The reason is that we cannot achieve a scalability benefit due to higher dimensional data decomposition if the FFT has only one dimension.
+        Therefore, one can also call FFTW directly in this case.
+  \item There is no equivalent of FFTW \emph{wisdom} in PFFT, i.e., you can not save a PFFT plan to disk and restore it for later use.
+  \item PFFT does not have full OpenMP support. All serial FFT computations and global communications are implemented with FFTW,
+        which offers OpenMP support, see Section~\ref{sec:openmp}. However, most of the PFFT-only features, such as pruned FFT, ghost cell send and 3d decomposition of 3d FFTs are not yet parallelized with OpenMP.
+  \item PFFT does not have full SIMD support. All serial FFT computations and global communications are implemented with FFTW,
+        which offers SIMD support, see Section~\ref{sec:simd}. However, most of the PFFT-only features, such as pruned FFT, ghost cell send and 3d decomposition of 3d FFTs are not yet parallelized with SIMD.
+  \item PFFT does not overlap communication and computation. The code of PFFT is built in a very modular structure. Most of these modules consist
+        of FFTW routines. Therefore, the global transposition does not support non-blocking communication.
+  \item Similar to FFTW, we do not provide any parallel IO routines. The user is responsible for loading and storing parallel data.
+  \item PFFT depends on FFTW to perform its serial transforms and does not support different vendor FFTs (such as Intel's MKL or IBM's ESSL).
+        However, this is not assumed to be a big drawback, since FFTW seems to perform very well on most platforms.
+%         We apply FFTW to multi-dimensional data sets in order to compute serial FFTs along single dimensions combined with transpositions of the multi-dimensional data in one step.
+%         As far as we know, there is no other FFT library that performs these two tasks. In addition, we use FFTW for local and global, i.e. serial and parallel, data transpositions.
+%         Thereby, changing the FFT vendor would affect only a 
+%         However, this is not assumed to be a big drawback, since FFTW seems to perform very well on most platforms.
+  \item The global communication routines can not be called separately. However, it should be possible to implement a user interface to our global
+        transposition routines.
+  \item PFFT does not support GPU parallelization.
+\end{compactitem}
+You are welcome to propose new PFFT features at \webpfft.
+
+\section{Alternative parallel FFT implementations}
+There have been several FFT implementations that aim to circumvent the scalability bottleneck
+for at least three dimensional FFTs by using two-dimensional decomposition approach.
+However, these implementations are often fitted to special problems and were not published
+as a stand alone software library. 
+Remarkable exceptions are the parallel FFT software library by S.~Plimpton~\cite{Pl97,sandiafft},
+the P3DFFT software library by D.~Pekurovsky~\cite{Pe12,p3dfft} and the \mbox{2DECOMP\&FFT} software library by N.~Li~\cite{Li2010, 2decompfft}.
+
+\section{Parallel nonequispaced FFT}
+If you are interested in a parallel implementation of nonequispaced fast Fourier
+transforms (NFFT) for distributed memory architectures, you should have a look at our PNFFT software library~\cite{pnfft, PiPo13}
+that is also available at \webpnfft.
+
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
new file mode 100644
index 0000000..28fcf9f
--- /dev/null
+++ b/doc/tutorial.rst
@@ -0,0 +1,437 @@
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Tutorial}\label{chap:tuto}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+The following chapter describes the usage of the PFFT library at the example of a simple test file in the first section,
+followed by the more advanced features of PFFT in the next sections.
+
+\section{A first parallel transform - Three-dimensional FFT with two-dimensional data decomposition}
+We explain the basic steps for computing a parallel FFT with the PFFT library at the example
+of the short test program given by Listing~\ref{lst:man_c2c}. This test computes a three-dimensional c2c-FFT on
+a two-dimensional process mesh. The source code \code{manual_c2c_3d.c} can be found in directory \code{tests/}
+of the library's source code tree. 
+\lstinputlisting[numbers=left, float, caption={Minimal parallel c2c-FFT test program.}, label=lst:man_c2c]{../tests/manual_c2c_3d.c}
+
+After initializing MPI with \code{MPI_Init} and before calling any other PFFT routine initialize
+the parallel FFT computations via
+\begin{lstlisting}
+void pfft_init(void);
+\end{lstlisting}
+MPI introduces the concept of communicators to store all the topological information of the physical process layout.
+PFFT requires to be called on a process mesh that corresponds to a periodic, Cartesian communicator.
+We assist the user in creating such a communicator with the following routine
+\begin{lstlisting}
+int pfft_create_procmesh_2d(
+    MPI_Comm comm, int np0, int np1,
+    MPI_Comm *comm_cart_2d);
+\end{lstlisting}
+This routine uses the processes within the communicator \code{comm} to create a two-dimensional process
+grid of size \code{np0} x \code{np1} and stores it into the Cartesian communicator \code{comm_cart_2d}.
+Note that \code{comm_cart_2d} is allocated by the routine and must be freed with \code{MPI_Comm_free} after usage.
+The input parameter \code{comm} is a communicator, indicating which processes will participate in the transform.
+Choosing \code{comm} as \code{MPI_COMM_WORLD} implies that the FFT is computed on all available processes.
+
+At the next step we need to know the data decomposition of the input and output array, that depends on
+the array sizes, the process grid and the chosen parallel algorithm. Therefore, we call
+\begin{lstlisting}
+ptrdiff_t pfft_local_size_3d(
+    ptrdiff_t *n, MPI_Comm comm_cart_2d, unsigned pfft_flags,
+    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+\end{lstlisting}
+Hereby, \code{n}, \code{local_ni}, \code{local_i_start}, \code{local_no}, \code{local_o_start} are arrays of length $3$ that must be allocated.
+The return value of this function equals the size of the local complex array that needs to be allocated by every process.
+In most cases, this coincides with the product of the local array sizes -- but may be bigger,
+whenever the parallel algorithm needs some extra storage.
+The input value \code{n} gives the three-dimensional FFT size and the flag \code{pfft_flags} serves to adjust
+some details of the parallel execution. For the sake of simplicity, we restrict ourselves to the case
+\code{pfft_flags=PFFT_TRANSPOSED_NONE} for a while and explain the more sophisticated flags at a later point.
+The output arrays \code{local_ni} and \code{local_i_start} give the size and the offset of the local input array
+that result from the parallel block distribution of the global input array, i.e.,
+every process owns the input data \code{in[k[0],k[1],k[2]]} with \code{local_i_start[t] <= k[t] < local_i_start[t] + local_ni[t]}
+for \code{t=0,1,2}. Analogously, the output parameters \code{local_o_start} and \code{local_no} contain the size
+and the offset of the local output array.
+
+Afterward, the input and output arrays must be allocated. Hereby,
+\begin{lstlisting}
+pfft_complex* pfft_alloc_complex(size_t size);
+\end{lstlisting}
+is a simple wrapper of \code{fftw_alloc_complex}, which in turn allocates the memory via \code{fftw_malloc} to ensure proper alignment for SIMD.
+Have a look at the FFTW user manual~\cite{fftw-align-mem} for more details on SIMD memory alignment and \code{fftw_malloc}.
+Nevertheless, you can also use any other dynamic memory allocation.
+
+The planning of a single three-dimensional parallel FFT of size \code{n[0]} x \code{n[1]} x \code{n[2]}
+is done by the function
+\begin{lstlisting}
+pfft_plan pfft_plan_dft_3d(
+    ptrdiff_t *n, pfft_complex *in, pfft_complex *out,
+    MPI_Comm comm_cart_2d, int sign, unsigned pfft_flags);
+\end{lstlisting}
+We provide the address of the input and output array by the pointers \code{in} and \code{out},
+respectively. An inplace transform is assumed if these pointers are equal.
+The integer \code{sign} gives the sign in the exponential of the FFT. Possible values are \code{PFFT_FORWARD} ($-1$)
+and \code{PFFT_BACKWARD} ($+1$).
+Flags passed to the planner via \code{pfft_flags} must coincide with the flags that were passed to \code{pfft_local_size_3d}.
+Otherwise the data layout of the parallel execution may not match calculated local array sizes.
+As return value we get a PFFT plan, some structure that stores all the information needed to perform a parallel FFT.
+
+Once the plan is generated, we are allowed to fill the input array \code{in}. Note that by default the planning step
+\code{pfft_plan_dft_3d} will overwrite the input array \code{in}. Therefore, you should not write any sensitive data into \code{in} until the plan has been generated.
+For simplicity, our test program makes use of the library function
+\begin{lstlisting}
+void pfft_init_input_complex_3d(
+    ptrdiff_t *n, ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+    pfft_complex *in);
+\end{lstlisting}
+to fill the input array with some numbers. Alternatively, one can fill the array with a function \code{func} of choice
+and the following loop that takes account of the parallel data layout
+\begin{lstlisting}
+ptrdiff_t m=0;
+for(ptrdiff_t k0=0; k0 < local_ni[0]; k0++)
+  for(ptrdiff_t k1=0; k1 < local_ni[1]; k1++)
+    for(ptrdiff_t k2=0; k2 < local_ni[2]; k2++)
+      in[m++] = func(k0 + local_i_start[0],
+                     k1 + local_i_start[1],
+                     k2 + local_i_start[2]);
+\end{lstlisting}
+The parallel FFT is computed when we execute the generated plan via
+\begin{lstlisting}
+void pfft_execute(const pfft_plan plan);
+\end{lstlisting}
+Now, the results can be read from \code{out} with an analogous three-dimensional loop.
+If we do not want to execute another parallel FFT of the same type, we free the allocated memory of the plan with
+\begin{lstlisting}
+void pfft_destroy_plan(pfft_plan plan);
+\end{lstlisting}
+Additionally, we use
+\begin{lstlisting}
+int MPI_Comm_free(MPI_Comm *comm);  
+\end{lstlisting}
+to free the communicator allocated by \code{pfft_create_procmesh_2d} and
+\begin{lstlisting}
+void pfft_free(void *ptr);
+\end{lstlisting}
+to free memory allocated by \code{pfft_alloc_complex}.
+Finally, we exit MPI via
+\begin{lstlisting}
+int MPI_Finalize(void);
+\end{lstlisting}
+
+
+\section{Porting FFTW-MPI based code to PFFT}\label{sec:porting}
+\todo[inline]{finish FFTW2PFFT porting example}
+We illustrate the close connection between FFTW-MPI and PFFT at a three-dimensional MPI example analogous to the example given in the FFTW manual~\cite{fftw-2dmpi}.
+\lstinputlisting[numbers=left, float, caption={Minimal parallel c2c-FFT test program.}, label=lst:fftw_3don1d]{man_fftw_3don1d.tex}
+
+Exactly the same task can be performed with PFFT as given in Listing~\ref{lst:pfft_3don1d}.
+\begin{lstlisting}
+#include <pfft.h>
+     
+int main(int argc, char **argv)
+{
+    const ptrdiff_t n[3] = {..., ..., ...};
+    pfft_plan plan;
+    pfft_complex *data;
+    ptrdiff_t alloc_local, local_ni[3], local_i_start[3], local_no[3], local_o_start[3], i, j, k;
+    unsigned pfft_flags = 0;
+
+    MPI_Init(&argc, &argv);
+    pfft_init();
+
+    /* get local data size and allocate */
+    alloc_local = pfft_local_size_dft_3d(n, MPI_COMM_WORLD, pfft_flags,
+				         local_ni, local_i_start,
+				         local_no, local_o_start);
+    data = pfft_alloc_complex(alloc_local);
+
+    /* create plan for in-place forward DFT */
+    plan = pfft_plan_dft_3d(n, data, data, MPI_COMM_WORLD,
+			    PFFT_FORWARD, PFFT_ESTIMATE);
+
+    /* initialize data to some function my_function(x,y,z) */
+    for (i = 0; i < local_ni[0]; ++i) 
+      for (j = 0; j < n[1]; ++j) 
+        for (k = 0; k < n[2]; ++k)
+          data[i*n[1]*n[2] + j*n[2] + k] = my_function(local_i_start[0] + i, j, k);
+
+    /* compute transforms, in-place, as many times as desired */
+    pfft_execute(plan);
+
+    pfft_destroy_plan(plan);
+
+    MPI_Finalize();
+}
+\end{lstlisting}
+
+
+
+\begin{compactitem}
+  \item substitute \code{fftw3-mpi.h} by \code{pfft.h}
+  \item substitute all prefixes \code{fftw_} and \code{fftw_mpi_} by \code{pfft_}
+  \item substitute all prefixes \code{FFTW_} by \code{PFFT_}
+  \item the integers \code{N}, \code{local_n0}, \code{local_0_start} become arrays of length 3
+  \item \code{dft_} in \code{pfft_local_size_dft_3d}
+  \item \code{pfft_local_size_dft_3d} has additional input \code{pfft_flags} and additional outputs \code{local_no}, \code{local_o_start}
+  \item The loop that inits \code{data} is split along all three dimensions. We could also use 
+  
+  
+\end{compactitem}
+
+
+First, all prefixes \code{fftw_} are substituted by \code{pfft_}.
+
+Now, the changes in order to use a two-dimensional process mesh are marginal as can be seen in Listing~\ref{lst:pfft_3don2d}.
+\begin{lstlisting}
+#include <pfft.h>
+     
+int main(int argc, char **argv)
+{
+    const ptrdiff_t n[3] = {..., ..., ...};
+    (red@const int np0 = ..., np1 = ...;@*)
+    pfft_plan plan;
+    pfft_complex *data;
+    ptrdiff_t alloc_local, local_ni[3], local_i_start[3], local_no[3], local_o_start[3], i, j, k;
+    unsigned pfft_flags = 0;
+    (red@MPI_Comm comm_cart_2d;@*)
+
+    MPI_Init(&argc, &argv);
+    pfft_init();
+
+    (red@/* create two-dimensional process grid of size np0 x np1 */@*)
+    (red@pfft_create_procmesh_2d(MPI_COMM_WORLD, np0, np1,@*)
+        (red@&comm_cart_2d);@*)
+    
+    /* get local data size and allocate */
+    alloc_local = pfft_local_size_dft_3d(n, (red@comm_cart_2d@*), pfft_flags,
+				         local_ni, local_i_start,
+				         local_no, local_o_start);
+    data = pfft_alloc_complex(alloc_local);
+
+    /* create plan for in-place forward DFT */
+    plan = pfft_plan_dft_3d(n, data, data, (red@comm_cart_2d@*),
+			    PFFT_FORWARD, PFFT_ESTIMATE);
+
+    /* initialize data to some function my_function(x,y,z) */
+    for (i = 0; i < local_ni[0]; ++i) 
+      for (j = 0; j < (red@local_ni[1]@*); ++j) 
+        for (k = 0; k < (red@local_ni[2]@*); ++k)
+          data[i*(red@local_ni[1]*local_ni[2]@*) + j*(red@local_ni[2]@*) + k] =
+              my_function(local_i_start[0] + i,
+		          (red@local_i_start[1] +@*) j,
+		          (red@local_i_start[2] +@*) k);
+
+    /* compute transforms, in-place, as many times as desired */
+    pfft_execute(plan);
+
+    pfft_destroy_plan(plan);
+
+    MPI_Finalize();
+}
+\end{lstlisting}
+
+
+
+
+
+
+
+\section{Errorcode for communicator creation}
+As we have seen the function
+\begin{lstlisting}
+int pfft_create_procmesh_2d(
+    MPI_Comm comm, int np0, int np1,
+    MPI_Comm *comm_cart_2d);
+\end{lstlisting}
+creates a two-dimensional, periodic, Cartesian communicator. The \code{int} return value
+(not used in Listing~\ref{lst:man_c2c}) is the forwarded error code of \code{MPI_Cart_create}.
+It is equal to zero if the communicator was created successfully.
+The most common error is that the number of processes within the input
+communicator \code{comm} does not fit \code{np0 x np1}. In this case the Cartesian communicator
+is not generated and the return value is unequal to zero. Therefore, a typical sanity check might look like
+\begin{lstlisting}
+/* Create two-dimensional process grid of size np[0] x np[1],
+   if possible */
+if( pfft_create_procmesh_2d(MPI_COMM_WORLD, np[0], np[1],
+        &comm_cart_2d) )
+{
+  pfft_fprintf(MPI_COMM_WORLD, stderr,
+      "Error: This test file only works with %d processes.\n",
+      np[0]*np[1]);
+  MPI_Finalize();
+  return 1;
+}
+\end{lstlisting}
+Hereby, we use the PFFT library function
+\begin{lstlisting}
+void pfft_fprintf(
+    MPI_Comm comm, FILE *stream, const char *format, ...);
+\end{lstlisting}
+to print the error message.
+This function is similar to the standard C function \code{fprintf} with the exception, that only the process with MPI rank $0$
+within the given communicator \code{comm} will produce some output; see Section~\ref{sec:fprintf} for details.
+
+\section{Inplace transforms}
+Similar to FFTW, PFFT is able to compute parallel FFTs completely in place, which means that besides some
+constant buffers, no second data array is necessary. In particular, the global data communication
+can be performed in place. As far as we know, there is no other parallel FFT library besides FFTW and PFFT that
+supports this feature.
+This feature is enabled as soon as the pointer to the output array \code{out} is equal to the pointer to the input array \code{in}.
+E.g., in Listing~\ref{lst:man_c2c} we would call
+\begin{lstlisting}[firstnumber=34]
+/* Plan parallel forward FFT */
+plan = pfft_plan_dft_3d(n, in, in, comm_cart_2d,
+    PFFT_FORWARD, PFFT_TRANSPOSED_NONE);
+\end{lstlisting}
+
+\section{Higher dimensional data decomposition}
+The test program given in Listing~\ref{lst:man_c2c} used a two-dimensional data decomposition of a three-dimensional data set.
+Moreover, PFFT supports the computation of any $d$-dimensional FFT with $r$-dimensional data decomposition
+as long as $r\le d-1$. For example, one can use a one-dimensional data decomposition for any two- or higher-dimensional data set,
+while the data set must be at least four-dimensional to fit to a three-dimensional data decomposition.
+The case $r=d$ is not supported efficiently, since during the parallel computations
+there is always at least one dimension that remains local, i.e., one dimension stays non-decomposed.
+The only exception from this rule is the case $d=r=3$ that is supported by PFFT in a special way, see Section~\ref{sec:3don3d} for details.
+
+The dimensionality of the data decomposition is given by the dimension of the Cartesian communicator that
+goes into the PFFT planning routines. Therefore, we present a generalization of the communicator creation function
+\begin{lstlisting}
+int pfft_create_procmesh(
+    int rnk_np, MPI_Comm comm, const int *np,
+    MPI_Comm *comm_cart);
+\end{lstlisting}
+Hereby, the array \code{np} of length \code{rnk_np} gives the size of the Cartesian communicator \code{comm_cart}.
+
+\section{Parallel data decomposition}\label{sec:par-data-decomp}
+In the following, we use the notation $\frac{n}{P}$ to symbolize that an array of length $n$ is broken into disjoint blocks and distributed on $P$ MPI processes.
+Hereby, the data is distributed in compliance to the FFTW-MPI data decomposition~\cite{fftw-mpi-data-distribution},
+i.e., the first \code{n/block} (rounded down) processes get a contiguous chunk of \code{block} elements,
+the next process gets the remaining \code{n - block * (n/block)} data elements, and all remaining processes get nothing.
+Thereby, the block size \code{block} defaults to \code{n/P} (rounded down) but can also be user defined.
+
+\subsection{Non-transposed and transposed data layout}
+In the following, we use the notation $\frac{n}{P}$ to symbolize that an array of length $n$ is distributed on $P$ MPI processes.
+The standard PFFT data decomposition of $h$ interleaved $d$-dimensional arrays of equal size $n_0 \times n_1\times \hdots \times n_{d-1}$
+on a $r$-dimensional process mesh of size $P_0\times \hdots \times P_{r-1}$ is given by the blocks
+\begin{equation*}
+  \frac{n_0}{P_0} \times \frac{n_1}{P_1} \times \hdots \times \frac{n_{r-1}}{P_{r-1}}  \times n_r \times n_{r+1} \times \hdots \times n_{d-1} \times h.
+\end{equation*}
+A PFFT created with planning flag \code{PFFT_TRANSPOSED_NONE} requires the inputs to be decomposed in this standard way and produces
+outputs that are decomposed in the same way.
+
+PFFT can save half of the global communication amount, if the data reordering to standard decomposition is omitted. 
+The transposed data decomposition is given by
+\begin{equation*}
+  \frac{n_1}{P_0} \times \frac{n_2}{P_1} \times \hdots \times \frac{n_{r}}{P_{r-1}}  \times n_0 \times n_{r+1} \times \hdots \times n_{d-1} \times h
+\end{equation*}
+A PFFT plan created with planning flag \code{PFFT_TRANSPOSED_OUT} produces outputs with transposed data decomposition.
+Analogously, a PFFT plan created with planning flag \code{PFFT_TRANSPOSED_IN} requires its inputs to be decomposed in the transposed way.
+Typically, one creates a forward plan with \code{PFFT_TRANSPOSED_OUT} and a backward plan with planning flag \code{PFFT_TRANSPOSED_IN}.
+
+Note that the flags \code{PFFT_TRANSPOSED_OUT} and \code{PFFT_TRANSPOSED_IN} must be passed to the array distribution function (see Section~\ref{sec:local-size})
+\emph{as well as} to the planner (see Section~\ref{sec:create-plan}).
+
+
+\subsection{Three-dimensional FFTs with three-dimensional data decomposition}\label{sec:3don3d}
+Many applications work with three-dimensional block decompositions of three-dimensional arrays.
+PFFT supports decompositions of the kind
+\begin{equation*}
+  \frac{n_0}{P_0} \times \frac{n_1}{P_1} \times \frac{n_2}{P_2} \times h.
+\end{equation*}
+However, PFFT applies a parallel algorithm that needs at least one non-distributed transform dimension (we do not transform along $h$).
+Therefore, we split the number of processes along the last dimension into two factors $P_2=Q_0Q_1$, remap
+the data to the two-dimensional decomposition
+\begin{equation*}
+  \frac{n_0}{P_0Q_0} \times \frac{n_1}{P_1Q_1} \times n_2 \times h,
+\end{equation*}
+and compute the parallel FFT with this two-dimensional decomposition.
+Note that the 3d to 2d remap implies some very special restrictions on the block sizes for $n_0$ and $n_1$, i.e.,
+the blocks must be divisible by $Q_0$ and $Q_1$. More precisely, the default blocks of the 2d-decomposition
+are given by \code{n0/(P0*Q0)} and \code{n1/(P1*Q1)} (both divisions rounded down).
+This implies that the default blocks of the 3d-decomposition must be \code{n0/(P0*Q0) * Q0},
+\code{n1/(P1*Q1) * Q1}, and \code{n2/(Q0*Q1)} (all divisions rounded down).
+
+
+\section{Planning effort}
+Pass one of the following flags
+\begin{compactitem}
+  \item \code{PFFT_ESTIMATE},
+  \item \code{PFFT_MEASURE},
+  \item \code{PFFT_PATIENT}, or,
+  \item \code{PFFT_EXHAUSTIVE}
+\end{compactitem}
+to the PFFT planner in order to plan all internal FFTW plans with \code{FFTW_ESTIMATE}, \code{FFTW_MEASURE}, \code{FFTW_PATIENT}, or \code{FFTW_EXHAUSTIVE},
+respectively. The default value is \code{PFFT_MEASURE}.
+
+PFFT uses FFTW plans for parallel array transposition and the serial transforms. In fact, every serial transform is a combination of
+strided lower-dimensional FFTs and a serial array transposition (necessary to prepare the global transposition) which can be done by a single FFTW plan.
+However, it turns out that FFTW sometimes performs better if the serial transposition and the strided FFTs are executed separately.
+Therefore, PFFT introduces the flag \code{PFFT_TUNE} that enables extensive run time tests in order to find the optimal sequence of
+serial strided FFT and serial transposition for every serial transform. These tests are disabled by default, which corresponds to the flag \code{PFFT_NO_TUNE}.
+
+\section{Preserving input data}
+The following flags
+\begin{compactitem}
+  \item \code{PFFT_PRESERVE_INPUT},
+  \item \code{PFFT_DESTROY_INPUT}, and,
+  \item \code{PFFT_BUFFERED_INPLACE}
+\end{compactitem}
+only take effect for out-of-place transforms.
+The first one behaves analogously to the FFTW flag \code{FFTW_PRESERVE_INPUT} and ensures that the input values are not overwritten.
+In fact, this flag implies that only the first serial transform is executed out-of-place and all
+successive steps are performed in-place on the output array.
+In compliance to FFTW, this is the default behaviour for out-of-place plans.
+
+The second flag behaves analogously to the FFTW flag \code{FFTW_DESTROY_INPUT} and tells the planner that
+the input array can be used as scratch array. This may give some speedup for out-of-place plans,
+because all the intermediate transforms and transposition steps can be performed out-of-place.
+
+Finally, the flag \code{PFFT_BUFFERED_INPLACE} can be used for out-of-place plans that store their inputs and outputs in the same array,
+i.e., array \code{out} is used for intermediate out-of-place transforms and transpositions but the PFFT inputs and outputs are stored in array \code{in}.
+
+
+\section{FFTs with shifted index sets}
+\todo[inline]{Describe shifted input and output}
+\begin{compactitem}
+  \item \code{PFFT_SHIFTED_IN}
+  \item \code{PFFT_SHIFTED_OUT}
+\end{compactitem}
+
+\section{Pruned FFT and Shifted Index Sets}
+\todo[inline]{Describe pruned FFT with shifted input and output}
+\subsection{Pruned FFT}
+Pruned r2r- and c2c-FFTs are defined as
+\begin{equation*}
+  g_l = \sum_{k=0}^{n_i-1} \hat g_k \eim{kl/n}, \quad l=0,\hdots,n_o-1,
+\end{equation*}
+where $n_i\le n$ and $n_o\le n$.
+
+\subsection{Shifted Index Sets}
+For $N\in 2\N$ we define the FFT with shifted inputs
+
+
+For $K,L,N\in 2\N$, $K<N$, $L<N$ we define
+
+
+
+
+
+\section{Precisions}\label{sec:prec}
+PFFT handles multiple precisions exactly in the same way as FFTW. Therefore, we quote part~\cite{fftw-prec} of the FFTW manual in the context of PFFT:
+
+You can install single and long-double precision versions of PFFT, which replace double with float and long double, respectively; see Section~\ref{sec:install}.
+To use these interfaces, you must
+\begin{compactitem}
+  \item Link to the single/long-double libraries; on Unix, \code{-lpfftf} or \code{-lpfftl} instead of (or in addition to) \code{-lpfft}.
+        (You can link to the different-precision libraries simultaneously.)
+  \item Include the same \code{<pfft.h>} header file.
+  \item Replace all lowercase instances of ‘\code{pfft_}’ with ‘\code{pfftf_}’ or ‘\code{pfftl_}’ for single or long-double precision, respectively.
+        (\code{pfft_complex} becomes \code{pfftf_complex}, \code{pfft_execute} becomes \code{pfftf_execute}, etcetera.)
+  \item Uppercase names, i.e. names beginning with ‘\code{PFFT_}’, remain the same.
+  \item Replace \code{double} with \code{float} or \code{long double} for subroutine parameters.
+\end{compactitem}
+
+\section{Ghost cell communication}
+\todo[inline]{explain ghost cell communication with a test file}
+
+\section{Fortran interface}
+\todo[inline]{explain F03 interface with a test file}

From 0830f318c5de73132080e044bb6819544fa19541 Mon Sep 17 00:00:00 2001
From: Yu Feng <rainwoodman@gmail.com>
Date: Sun, 13 Sep 2015 01:43:47 -0700
Subject: [PATCH 3/6] Convert to rst with pandoc

This is at least somewhat comprehensible.
---
 doc/develop.rst   |   50 +-
 doc/features.rst  |  291 +++++----
 doc/fortran.rst   |    5 +-
 doc/install.rst   |  232 ++++---
 doc/interface.rst |  212 ++++---
 doc/intro.rst     |  245 ++++---
 doc/reference.rst | 1550 +++++++++++++++++++++++++++++++++++++++++++++
 doc/tutorial.rst  |  910 ++++++++++++++------------
 8 files changed, 2634 insertions(+), 861 deletions(-)
 create mode 100644 doc/reference.rst

diff --git a/doc/develop.rst b/doc/develop.rst
index a728ce8..60781d0 100644
--- a/doc/develop.rst
+++ b/doc/develop.rst
@@ -1,42 +1,38 @@
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\chapter{Developers Guide}\label{chap:develop}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+Developers Guide
+================
 
-
-
-\section{Search and replace patterns}
+Search and replace patterns
+---------------------------
 
 Correct alignment of pfft.h header
-\begin{lstlisting}
-%s/^\(    [^ ]\+[^\\]*\)  \\/  \1\\/g  
-\end{lstlisting}
 
-Expand most macros of pfft.h to generate the function reference of this manual:
-\begin{lstlisting}[language=bash,prebreak=\textbackslash,]
-sed -e 's/ *\\$//g' -e 's/PFFT_EXTERN //g' \
-    -e 's/PX(\([^)]*\))/pfft_\1/g' -e 's/ INT/ ptrdiff_t/g' \
-    -e 's/ R/ double/g' -e 's/ C/ pfft_complex/g' \
-    -e 's/^  //g' pfft.h > pfft.h.expanded
-\end{lstlisting}
+::
 
+    %s/^\(    [^ ]\+[^\\]*\)  \\/  \1\\/g  
 
+Expand most macros of pfft.h to generate the function reference of this
+manual:
 
+::
 
+    sed -e 's/ *\\$//g' -e 's/PFFT_EXTERN //g' \
+        -e 's/PX(\([^)]*\))/pfft_\1/g' -e 's/ INT/ ptrdiff_t/g' \
+        -e 's/ R/ double/g' -e 's/ C/ pfft_complex/g' \
+        -e 's/^  //g' pfft.h > pfft.h.expanded
 
+ToDo
+====
 
+-  ``PFFT_FORWARD`` is defined as ``FFTW_FORWARD``
 
+-  ``FFTW_FORWARD`` is defined as :math:`-1`
 
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\chapter{ToDo}\label{chap:todo}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+-  PFFT allows to choose between ``FFTW_FORWARD`` and ``FFTW_BACKWARD``, which is not implemented by FFTW.
 
-\begin{itemize}
-  \item \code{PFFT_FORWARD} is defined as \code{FFTW_FORWARD}
-  \item \code{FFTW_FORWARD} is defined as $-1$
-  \item PFFT allows to chose between \code{FFTW_FORWARD} and \code{FFTW_BACKWARD}, which is not implemented by FFTW.
-  \item Matlab uses the same sign convention, i.e., $-1$ for \code{fft} and $+1$ for \code{ifftn}
-\end{itemize}
+-  Matlab uses the same sign convention, i.e., :math:`-1` for ``fft`` and
+   :math:`+1` for ``ifftn``
 
-\section{Measuring parallel run times}
-Use \code{MPI_Barrier} in front of every call to \code{pfft_} function to avoid unbalanced run times.
+Measuring parallel run times
+----------------------------
 
+Use ``MPI_Barrier`` in front of every call to a ``pfft_`` function to avoid unbalanced run times.
diff --git a/doc/features.rst b/doc/features.rst
index 13c48a1..417abf8 100644
--- a/doc/features.rst
+++ b/doc/features.rst
@@ -1,161 +1,174 @@
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\chapter{Advanced Features}\label{chap:feat}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-
-%------------------------------------------------------------------------------
-\section{How to Deal with FFT Index Shifts in Parallel}
-%------------------------------------------------------------------------------
-Let $n\in2\N$. A common problem is that the index of the FFT input and/or output array runs between $-\nicefrac n2,\hdots,\nicefrac n2-1$,
-but the FFT library requires them to run between $0,\hdots,n-1$. With serial program execution one can easily remap the input data $\hat g_k$
-in a way that is suitable for the library, i.e.,
-\begin{equation*}
-  \hat f_k := \hat g_{(k-\nicefrac n2\bmod n)}, \quad k = 0,\hdots,n-1.
-\end{equation*}
-Similarly, one could remap the outputs of the library $f_l$, $l=0,\cdots,n-1$ in the opposite direction in order to get the
+Advanced Features
+=================
+
+How to Deal with FFT Index Shifts in Parallel
+---------------------------------------------
+
+Let :math:`n\in2\N`. A common problem is that the index of the FFT input
+and/or output array runs between
+:math:`-\nicefrac n2,\hdots,\nicefrac n2-1`, but the FFT library
+requires them to run between :math:`0,\hdots,n-1`. With serial program
+execution one can easily remap the input data :math:`\hat g_k` in a way
+that is suitable for the library, i.e.,
+
+.. math:: \hat f_k := \hat g_{(k-\nicefrac n2\bmod n)}, \quad k = 0,\hdots,n-1.
+
+Similarly, one could remap the outputs of the library :math:`f_l`,
+:math:`l=0,\cdots,n-1` in the opposite direction in order to get the
 required outputs, i.e.,
-\begin{equation*}
-  g_l := f_{l \bmod n}, \quad l = -\nicefrac n2,\hdots,\nicefrac n2-1.
-\end{equation*}
-These shifts are also known as \code{fftshift} in Matlab.
 
-However, with distributed memory these \code{fftshift} operations require more complex data movements and result in a global communication.
-For example, the first index of the array moves to the middle and, therefore, the corresponding data move to another MPI process.
-Fortunately, this communication can be avoided at the cost of little extra computation.
-At the end of the section we present two PFFT library functions that perform the necessary pre- and postprocessing
-for shifted input and output index sets.
+.. math:: g_l := f_{l \bmod n}, \quad l = -\nicefrac n2,\hdots,\nicefrac n2-1.
 
-\subsection{Shift with half the FFT size}
+These shifts are also known as ``fftshift`` in Matlab.
 
-The special case of input shift $k_s=-\nicefrac n2$ and/or output shift $l_s=-\nicefrac n2$ is supported by PFFT.
-User can choose to shift the input (\verb+PFFT_SHIFTED_IN+) and/or to shift the output (\verb+PFFT_SHIFTED_OUT+).
-\todo{this flag can be used for \code{local_size} and planning}
+However, with distributed memory these ``fftshift`` operations require more complex
+data movements and result in a global communication. For example, the
+first index of the array moves to the middle and, therefore, the
+corresponding data move to another MPI process. Fortunately, this
+communication can be avoided at the cost of little extra computation. At
+the end of the section we present two PFFT library functions that
+perform the necessary pre- and postprocessing for shifted input and
+output index sets.
+
+Shift with half the FFT size
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The special case of input shift :math:`k_s=-\nicefrac n2` and/or output
+shift :math:`l_s=-\nicefrac n2` is supported by PFFT. User can choose to
+shift the input (``PFFT_SHIFTED_IN``) and/or to shift the output
+(``PFFT_SHIFTED_OUT``).
 
 Here, we are interested in the computation of
-\begin{equation*}
-  g_l = \sum_{k=-\nicefrac{n_i}{2}}^{\nicefrac{n_i}{2}-1} \hat g_k \eim{kl/n}, \quad l=-\nicefrac{n_o}{2},\hdots,\nicefrac{n_o}{2}-1  
-\end{equation*}
-with $n, n_i, n_o \in 2\N$ and $n>n_i$, $n>n_o$.
-
-With an index shift of $\nicefrac n2$ both in $k$ and $l$ this equivalent to the computation of
-\begin{align*}
-  g_{(l-\nicefrac{n}{2})}
-  &= \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
-     \hat g_{(k-\nicefrac{n}{2})} \eim{(k-\nicefrac n2)(l-\nicefrac n2)/n} \\
-  &= \e^{+\pi\ti l} 
-       \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
-       \left(\hat g_{(k-\nicefrac{n}{2})}\e^{+\pi\ti (k-\nicefrac n2)}\right) \eim{kl/n} \\
-  &= \e^{+\pi\ti(l-\nicefrac n2)} 
-     \underbrace{
-       \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
-       \underbrace{\left(\hat g_{(k-\nicefrac{n}{2})}\e^{+\pi\ti k}\right)}_{\hat f_k} \eim{kl/n}
-     }_{f_l}
-\end{align*}
-for $ l=\nicefrac n2-\nicefrac{n_o}{2},\hdots,\nicefrac n2 +\nicefrac{n_o}{2}-1$.
-Therefore, we get the following algorithm
 
-\begin{equation*}
-  f_l = \sum_{k=0}^n \hat g_k \eim{kl/n}, \quad l=-\nicefrac{n_o}{2},\hdots,\nicefrac{n_o}{2}-1  
-\end{equation*}
+.. math:: g_l = \sum_{k=-\nicefrac{n_i}{2}}^{\nicefrac{n_i}{2}-1} \hat g_k \eim{kl/n}, \quad l=-\nicefrac{n_o}{2},\hdots,\nicefrac{n_o}{2}-1
 
-The special case $k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}$ corresponds to the shifts the arrays (\textsf{FFTSHIFT})
-\begin{algorithm}
-  \begin{algorithmic}[1]
-    \itemsep=1.1ex
-    \State For $k=0,\hdots,n-1$ set $\hat f_k = 0$.
-    \State For $k=-\nicefrac{n_i}{2},\hdots,\nicefrac{n_i}{2}-1$ compute $\hat f_{(k+\nicefrac{n}{2})} = (-1)^{(k+\nicefrac{n}{2})} \hat g_{k}$.
-    \State For $l=0,\hdots,n-1$ compute $f_l = \sum_{k=0}^{n} \hat f_k \eim{kl/n}$ using PFFT.
-    \State For $l=-\nicefrac{n_o}{2},\hdots,\nicefrac{n_o}{2}-1$ compute $g_l = (-1)^l f_{(l+n/2)} $.
-  \end{algorithmic}
-\end{algorithm}
+with :math:`n, n_i, n_o \in 2\N` and :math:`n>n_i`, :math:`n>n_o`.
 
+With an index shift of :math:`\nicefrac n2` both in :math:`k` and
+:math:`l` this is equivalent to the computation of
 
-Note, that this shift implies that the library deals with pruned FFTs in a special way, i.e., half of the zeros are added
-at the beginning of the inputs and the other half is added at the end.
+.. math::
 
+   \begin{aligned}
+     g_{(l-\nicefrac{n}{2})}
+     &= \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
+        \hat g_{(k-\nicefrac{n}{2})} \eim{(k-\nicefrac n2)(l-\nicefrac n2)/n} \\
+     &= \e^{+\pi\ti l} 
+          \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
+          \left(\hat g_{(k-\nicefrac{n}{2})}\e^{+\pi\ti (k-\nicefrac n2)}\right) \eim{kl/n} \\
+     &= \e^{+\pi\ti(l-\nicefrac n2)} 
+        \underbrace{
+          \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
+          \underbrace{\left(\hat g_{(k-\nicefrac{n}{2})}\e^{+\pi\ti k}\right)}_{\hat f_k} \eim{kl/n}
+        }_{f_l}\end{aligned}
 
+for
+:math:`l=\nicefrac n2-\nicefrac{n_o}{2},\hdots,\nicefrac n2 +\nicefrac{n_o}{2}-1`.
+Therefore, we get the following algorithm
 
+.. math:: f_l = \sum_{k=0}^n \hat g_k \eim{kl/n}, \quad l=-\nicefrac{n_o}{2},\hdots,\nicefrac{n_o}{2}-1
 
+The special case :math:`k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}`
+corresponds to the shifts of the arrays (FFTSHIFT)
 
+#. For :math:`k=0,\hdots,n-1` set :math:`\hat f_k = 0`.
+#. For :math:`k=-\nicefrac{n_i}{2},\hdots,\nicefrac{n_i}{2}-1` compute
+   :math:`\hat f_{(k+\nicefrac{n}{2})} = (-1)^{(k+\nicefrac{n}{2})} \hat g_{k}`.
+#. For :math:`l=0,\hdots,n-1` compute
+   :math:`f_l = \sum_{k=0}^{n} \hat f_k \eim{kl/n}` using PFFT.
+#. For :math:`l=-\nicefrac{n_o}{2},\hdots,\nicefrac{n_o}{2}-1` compute
+   :math:`g_l = (-1)^l f_{(l+n/2)}`.
+
+Note, that this shift implies that the library deals with pruned FFTs in
+a special way, i.e., half of the zeros are added at the beginning of the
+inputs and the other half is added at the end.
+
+Arbitrary shifts
+~~~~~~~~~~~~~~~~
 
-\subsection{Arbitrary shifts}
 More general shifts must be done by the user.
 
+In a more general setting, we are interested in the computation of FFTs
+with shifted index sets, i.e., assume :math:`k_s,l_s\in\Z` and compute
+
+.. math::
+
+   g_l = \sum_{k=k_s}^{n_i+k_s-1} \hat g_k \eim{kl/n},
+     \quad l=l_s,\hdots,n_o+l_s-1\,.
+
+Because of the periodicity of the FFT this can be easily performed by
+the following algorithm.
+
+**Algorithm (shifted FFT with explicit data movement)**
+
+#. For :math:`k=0,\hdots,n_i-1` assign
+   :math:`\hat f_k = \hat g_{(k+k_s\bmod n_i)}`.
+#. For :math:`l=0,\hdots,n_o-1` compute
+   :math:`f_l = \sum_{k=0}^{n_i} \hat f_k \eim{kl/n}` using PFFT.
+#. For :math:`l=0,\hdots,n_o-1` assign :math:`g_l = f_{(l-l_s\bmod n_o)}`.
+
+However, this involves explicit data movement since the sequence of data
+changes. For our parallel data decomposition the change of data layout
+requires data communication. A simple index shift results in the
+computation of
+
+.. math::
+
+   \begin{aligned}
+     g_{l+l_s}
+     &=
+       \sum_{k=k_s}^{n_i+k_s-1} \hat g_k \eim{k(l+l_s)/n}
+       =
+       \sum_{k=0}^{n_i-1} \hat g_{k+k_s} \eim{(k+k_s)(l+l_s)/n} \\
+     &=
+       \eim{k_sl/n} \sum_{k=0}^{n_i-1} \underbrace{\left(\hat g_{k+k_s}\eim{(k+k_s)l_s/n}\right)}_{=: \hat f_k} \eim{kl/n}\end{aligned}
+
+for all :math:`l=0,\hdots,n_o-1`. The resulting
+algorithm below preserves the sequence of data at the
+price of some extra computation.
+
+**Algorithm (shifted FFT without explicit data movement)**
+
+#. For :math:`k=0,\hdots,n_i-1` compute
+   :math:`\hat f_k = \hat g_{(k+k_s)} \eim{(k+k_s)l_s/n}`.
+#. For :math:`l=0,\hdots,n_o-1` compute
+   :math:`f_l = \sum_{k=0}^{n_i} \hat f_k \eim{kl/n}` using PFFT.
+#. For :math:`l=0,\hdots,n_o-1` compute :math:`g_{(l+l_s)} = f_l \eim{k_sl/n}`.
+
+The special case :math:`k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}`
+corresponds to the shifts of the arrays (FFTSHIFT)
+
+#. For :math:`k=0,\hdots,n_i-1` compute
+   :math:`\hat f_k = \hat g_{(k-\nicefrac{n_i}{2})} \e^{+\pi\ti (k-\nicefrac{n_i}{2})n_o/n}`.
+#. For :math:`l=0,\hdots,n_o-1` compute
+   :math:`f_l = \sum_{k=0}^{n_i} \hat f_k \eim{kl/n}` using PFFT.
+#. For :math:`l=0,\hdots,n_o-1` compute
+   :math:`g_{(l-\nicefrac{n_o}{2})} = f_l \e^{+\pi\ti n_i l/n}`.
+
+Parallel pruned FFT
+-------------------
 
-In a more general setting, we are interested in the computation of FFTs with shifted index sets, i.e., assume $k_s,l_s\in\Z$ and compute
-\begin{equation*}
-  g_l = \sum_{k=k_s}^{n_i+k_s-1} \hat g_k \eim{kl/n},
-  \quad l=l_s,\hdots,n_o+l_s-1\,.
-\end{equation*}
-Because of the periodicity of the FFT this can be easily performed by \algname~\ref{alg:fftshift_translation}.
-\begin{algorithm}\label{alg:fftshift_translation}
-  \begin{algorithmic}[1]
-    \itemsep=1.1ex
-    \State For $k=0,\hdots,n_i-1$ assign $\hat f_k = \hat g_{(k+k_s\bmod n_i)}$.
-    \State For $l=0,\hdots,n_o-1$ compute $f_l = \sum_{k=0}^{n_i} \hat f_k \eim{kl/n}$ using PFFT.
-    \State For $l=0,\hdots,n_o-1$ assign $g_l = f_{(l-l_s\bmod n_o)}$.
-  \end{algorithmic}
-  \caption{Shifted FFT with explicit data movement.}
-\end{algorithm}
-However, this involves explicit data movement since the sequence of data changes.
-For a our parallel data decomposition the change of data layout requires data communication.
-A simple index shift results in the computation of
-\begin{align*}
-  g_{l+l_s}
-  &=
-    \sum_{k=k_s}^{n_i+k_s-1} \hat g_k \eim{k(l+l_s)/n}
-    =
-    \sum_{k=0}^{n_i-1} \hat g_{k+k_s} \eim{(k+k_s)(l+l_s)/n} \\
-  &=
-    \eim{k_sl/n} \sum_{k=0}^{n_i-1} \underbrace{\left(\hat g_{k+k_s}\eim{(k+k_s)l_s/n}\right)}_{=: \hat f_k} \eim{kl/n}
-\end{align*}
-for all $l=0,\hdots,n_o-1$. The resulting \algname~\ref{alg:fftshift_modulation} preserves the sequence of
-data at the price of some extra computation.
-\begin{algorithm}\label{alg:fftshift_modulation}
-  \begin{algorithmic}[1]
-    \itemsep=1.1ex
-    \State For $k=0,\hdots,n_i-1$ compute $\hat f_k = \hat g_{(k+k_s)} \eim{(k+k_s)l_s/n}$.
-    \State For $l=0,\hdots,n_o-1$ compute $f_l = \sum_{k=0}^{n_i} \hat f_k \eim{kl/n}$ using PFFT.
-    \State For $l=0,\hdots,n_o-1$ compute $g_{(l+l_s)} = f_l \eim{k_sl/n}$.
-  \end{algorithmic}
-  \caption{Shifted FFT without explicit data movement.}
-\end{algorithm}
-
-The special case $k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}$ corresponds to the shifts the arrays (\textsf{FFTSHIFT})
-\begin{algorithm}
-  \begin{algorithmic}[1]
-    \itemsep=1.1ex
-    \State For $k=0,\hdots,n_i-1$ compute $\hat f_k = \hat g_{(k-\nicefrac{n_i}{2})} \e^{+\pi\ti (k-\nicefrac{n_i}{2})n_o/n}$.
-    \State For $l=0,\hdots,n_o-1$ compute $f_l = \sum_{k=0}^{n_i} \hat f_k \eim{kl/n}$ using PFFT.
-    \State For $l=0,\hdots,n_o-1$ compute $g_{(l-\nicefrac{n_o}{2})} = f_l \e^{+\pi\ti n_i l/n}$.
-  \end{algorithmic}
-\end{algorithm}
-
-
-
-
-%------------------------------------------------------------------------------
-\section{Parallel pruned FFT}
-%------------------------------------------------------------------------------
 Within PFFT we define a pruned FFT as
-\begin{equation*}
-  g_l = \sum_{k=0}^{n_i-1} \hat g_{k} \eim{kl/n}, \quad l=0,\hdots,n_o-1.
-\end{equation*}
-Formally, this is equivallent to the following regular size $n$ FFT
-\begin{equation*}
-  f_l = \sum_{k=0}^{n-1} \hat f_{k} \eim{kl/n}, \quad l=0,\hdots,n,
-\end{equation*}
-with 
-\begin{equation*}
-  \hat g_k := 
-  \begin{cases}
-  \hat f_k, &: k=0,\hdots,n_1-1, \\
-  0         &: k=n_i,\hdots,n-1,    
-  \end{cases}
-\end{equation*}
-and $f_l := g_l$, $k=0,\hdots,n_o-1$. I.e., we add $n-n_i$ zeros at the end of the input array and throw away $n-n_o$ entries at the end of the output array.
-
-
-The definition of pruned FFT changes for \code{PFFT_SHIFTED_IN} and \code{PFFT_SHIFTED_OUT}.
 
+.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_{k} \eim{kl/n}, \quad l=0,\hdots,n_o-1.
+
+Formally, this is equivalent to the following regular size :math:`n`
+FFT
+
+.. math:: f_l = \sum_{k=0}^{n-1} \hat f_{k} \eim{kl/n}, \quad l=0,\hdots,n-1,
+
+with
+
+.. math::
+
+   \hat f_k :=
+     \begin{cases}
+     \hat g_k, &: k=0,\hdots,n_i-1, \\
+     0         &: k=n_i,\hdots,n-1,
+     \end{cases}
+
+and :math:`g_l := f_l`, :math:`l=0,\hdots,n_o-1`. I.e., we add
+:math:`n-n_i` zeros at the end of the input array and throw away
+:math:`n-n_o` entries at the end of the output array.
 
+The definition of pruned FFT changes for ``PFFT_SHIFTED_IN`` and ``PFFT_SHIFTED_OUT``.
diff --git a/doc/fortran.rst b/doc/fortran.rst
index 0f2d792..fad7927 100644
--- a/doc/fortran.rst
+++ b/doc/fortran.rst
@@ -1,3 +1,4 @@
-\chapter{Fotran Interface}
+Fortran Interface
+=================
 
-based on Fortran 90
\ No newline at end of file
+based on Fortran 90
diff --git a/doc/install.rst b/doc/install.rst
index c8b96bf..87246e5 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -1,103 +1,137 @@
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\chapter{Installation and linking}\label{chap:inst}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-The install of PFFT is based on the Autotools and follows the typical workflow
-\begin{lstlisting}[escapechar=§]
-./configure
-make
-make install
-\end{lstlisting}
-
-
-\section{Install of the latest official FFTW release}\label{sec:fftw_inst}
-PFFT depends on Release~\fftwversion{} of the FFTW library~\cite{fftw}.
-For the sake of completeness, we show the command line based install procedure in the following.
-However, note that we provide install scripts on \websoft that simplify the install a lot.
-We highly recommend to use these install scripts, since they additionally apply several
-performance patches and bugfixes that have been submitted to the FFTW developers but
-are not yet included in the official FFTW releases.
-\begin{lstlisting}[escapechar=§]
-wget http://www.fftw.org/fftw-§\fftwversionsl§.tar.gz
-tar xzvf fftw-§\fftwversion§.tar.gz
-cd fftw-§\fftwversion§
-./configure --enable-mpi --prefix=$HOME/local/fftw3_mpi §\label{lst:fftw:conf}§
-make
-make install
-\end{lstlisting}
-The MPI algorithms of FFTW must be build with a MPI C compiler. Add the statement \code{MPICC=\$MPICCOMP}
-at the end of line~\ref{lst:fftw:conf} if the \code{configure} script fails to determine the right
-MPI C compiler \code{\$MPICCOMP}. Similarly, the MPI Fortran compiler \code{\$MPIFCOMP} is set by \code{MPIFC=\$MPIFCOMP}.
-
-\section{Install of the PFFT library}\label{sec:pfft-inst}
-In the simplest case, the hardware platform and the \fftw-\fftwversion{} library are
-recognized by the PFFT configure script automatically, so all we have to do is
-\begin{lstlisting}[escapechar=§]
-wget http://www.tu-chemnitz.de/~mpip/software/pfft-§\pfftversionsl§.tar.gz
-tar xzvf pfft-§\pfftversion§.tar.gz
-cd pfft-§\pfftversion§
-./configure
-make
-make check
-make install
-\end{lstlisting}
-Hereby, the optional call \code{make check} builds the test programs.
-If the \fftw-\fftwversion{} software library is already installed on your system but not found by the configure script,
-you can provide the FFTW installation directory \code{\$FFTWDIR} to configure by
-\begin{lstlisting}[language=bash]
-./configure --with-fftw3=$FFTWDIR
-\end{lstlisting}
-This call implies that the FFTW header files are located in \code{\$FFTWDIR/include} and the FFTW library files are located
-in \code{\$FFTWDIR/lib}. Otherwise, one should specify the FFTW include path \code{\$FFTWINC} and the FFTW library path
-\code{\$FFTWLIB} separately by
-\begin{lstlisting}[prebreak = {\textbackslash}]
-./configure --with-fftw3-includedir=$FFTWINC --with-fftw3-libdir=$FFTWLIB
-\end{lstlisting}
+Installation and linking
+========================
+
+The install of PFFT is based on the Autotools and follows the typical
+workflow
+
+::
+
+    ./configure
+    make
+    make install
+
+Install of the latest official FFTW release
+-------------------------------------------
+
+PFFT depends on the FFTW library (http://www.fftw.org). For the sake of
+completeness, we show the command line based install procedure in the
+following. However, note that we provide install scripts on the PFFT
+website that simplify the install a lot. We highly recommend to use these
+install scripts, since they additionally apply several performance patches and
+bugfixes that have been submitted to the FFTW developers but are not yet
+included in the official FFTW releases.
+
+::
+
+    wget http://www.fftw.org/fftw-<version>.tar.gz
+    tar xzvf fftw-<version>.tar.gz
+    cd fftw-<version>
+    ./configure --enable-mpi --prefix=$HOME/local/fftw3_mpi
+    make
+    make install
+
+The MPI algorithms of FFTW must be built with an MPI C compiler. Add the statement
+``MPICC=$MPICCOMP`` at the end of the ``./configure`` line above if the ``configure``
+script fails to determine the right MPI C compiler ``$MPICCOMP``. Similarly, the
+MPI Fortran compiler ``$MPIFCOMP`` is set by ``MPIFC=$MPIFCOMP``.
+
+Install of the PFFT library
+---------------------------
+
+In the simplest case, the hardware platform and the FFTW library are
+recognized by the PFFT configure script automatically, so all we have to
+do is
+
+::
+
+    wget http://www.tu-chemnitz.de/~mpip/software/pfft-<version>.tar.gz
+    tar xzvf pfft-<version>.tar.gz
+    cd pfft-<version>
+    ./configure
+    make
+    make check
+    make install
+
+Hereby, the optional call ``make check`` builds the test programs. If the FFTW software
+library is already installed on your system but not found by the
+configure script, you can provide the FFTW installation directory ``$FFTWDIR`` to
+configure by
+
+.. code:: bash
+
+    ./configure --with-fftw3=$FFTWDIR
+
+This call implies that the FFTW header files are located in ``$FFTWDIR/include`` and the FFTW
+library files are located in ``$FFTWDIR/lib``. Otherwise, one should specify the FFTW
+include path ``$FFTWINC`` and the FFTW library path ``$FFTWLIB`` separately by
+
+::
+
+    ./configure --with-fftw3-includedir=$FFTWINC --with-fftw3-libdir=$FFTWLIB
+
 At the end, this is equivalent to
-\begin{lstlisting}[prebreak = {\textbackslash}]
-./configure CPPFLAGS=-I$FFTWINC LDFLAGS=-L$FFTWLIB
-\end{lstlisting}
-which is more common to experienced users of the Autotools.
-To install PFFT in a user specified directory \code{\$PFFTINSTDIR} call configure with the option
-\begin{lstlisting}
-./configure --prefix=$PFFTINSTDIR
-\end{lstlisting}
-However, this option is mandatory whenever you do not have root permissions on your machine, since the default install paths of 
-\code{configure} are not accessible by standard users.
-The PFFT library must be built with a MPI compiler. In Section~\ref{sec:fftw_inst} we already described how to hand the right compilers to the \code{configure} script.
-Some more options are
-\begin{compactitem}
-  \item \code[keywords=]{--enable-float}: Produces a single-precision version of PFFT (float) instead of the default double-precision (double); see \ref{sec:prec}.
-  \item \code[keywords=]{--enable-long-double}: Produces a long-double precision version of PFFT (long double) instead of the default double-precision (double); see \ref{sec:prec}.
-  \item \code{--disable-fortran}: Disables inclusion of Fortran wrapper routines in the standard PFFT libraries.
-  \item \code{--disable-tests}: Disables build of test programs.
-\end{compactitem}
-For more details on the options of the \code{configure} script call
-\begin{lstlisting}
-./configure --help
-\end{lstlisting}
-
-
-\section{How to include PFFT in your program}
+
+::
+
+    ./configure CPPFLAGS=-I$FFTWINC LDFLAGS=-L$FFTWLIB
+
+which is more common to experienced users of the Autotools. To install
+PFFT in a user specified directory ``$PFFTINSTDIR`` call configure with the option
+
+::
+
+    ./configure --prefix=$PFFTINSTDIR
+
+However, this option is mandatory whenever you do not have root
+permissions on your machine, since the default install paths of ``configure``
+are not accessible by standard users. The PFFT library must be built with a MPI
+compiler. In `Install of the latest official FFTW release`_ we already described
+how to hand the right compilers to the ``configure`` script. Some more options are
+
+-  ``--enable-float``: Produces a single-precision version of PFFT (float)
+   instead of the default double-precision (double); see [sec:prec].
+
+-  ``--enable-long-double``: Produces a long-double precision version of PFFT
+   (long double) instead of the default double-precision (double); see [sec:prec].
+
+-  ``--disable-fortran``: Disables inclusion of Fortran wrapper routines in the
+   standard PFFT libraries.
+
+-  ``--disable-tests``: Disables build of test programs.
+
+For more details on the options of the ``configure`` script call
+
+::
+
+    ./configure --help
+
+How to include PFFT in your program
+-----------------------------------
+
 All programs using PFFT should include its header file
-\begin{lstlisting}
-#include <pfft.h>
-\end{lstlisting}
-This header includes the FFTW headers \code{fftw.h}, \code{fftw-mpi.h} automatically. Make sure that the compiler can find them by setting
-the include flags appropriately.
-You must also link to the PFFT, FFTW and FFTW-MPI libraries. On Unix, this means adding \code{-lpfft -lfftw3_mpi -lfftw3 -lm} at the end of the link command.
-For example, to build \code{pfft_test.c} use the following compiler invocation
-\begin{lstlisting}[prebreak = {\textbackslash}]
-mpicc pfft_test.c -I$PFFTINC -I$FFTWINC -L$PFFTLIB -L$FFTWLIB -lpfft -lfftw3_mpi -lfftw3 -lm
-\end{lstlisting}
-Substitute \code{mpicc} by any other MPI C compiler if you like.
-\code{\$PFFTINC}, \code{\$FFTWINC}, \code{\$PFFTLIB}, and \code{\$FFTWLIB} denote the PFFT and FFTW include and library paths, respectively.
-If you use the install scripts mentioned in Sect.~\ref{sec:pfft-inst}, these paths will be
-\begin{lstlisting}[escapechar=§,numbers=none]
-PFFTINC = $HOME/local/pfft-§\pfftversion§/include
-FFTWINC = $HOME/local/fftw-§\fftwversion§/include
-PFFTINC = $HOME/local/pfft-§\pfftversion§/lib
-FFTWINC = $HOME/local/fftw-§\fftwversion§/lib
-\end{lstlisting}
 
+::
+
+    #include <pfft.h>
+
+This header includes the FFTW headers ``fftw3.h`` and ``fftw3-mpi.h`` automatically.
+Make sure that the compiler can find them by setting the include flags appropriately.
+You must also link to the PFFT, FFTW and FFTW-MPI libraries. On Unix,
+this means adding ``-lpfft -lfftw3_mpi -lfftw3 -lm`` at the end of the link command.
+For example, to build ``pfft_test.c`` use the following compiler invocation
+
+::
+
+    mpicc pfft_test.c -I$PFFTINC -I$FFTWINC -L$PFFTLIB -L$FFTWLIB -lpfft -lfftw3_mpi -lfftw3 -lm
+
+Substitute ``mpicc`` by any other MPI C compiler if you like. ``$PFFTINC``, ``$FFTWINC``,
+``$PFFTLIB``, and ``$FFTWLIB`` denote the PFFT and FFTW include and library paths, respectively.
+If you use the install scripts mentioned in `Install of the PFFT library`_, these paths will be
+
+::
+
+    PFFTINC = $HOME/local/pfft-<version>/include
+    FFTWINC = $HOME/local/fftw-<version>/include
+    PFFTLIB = $HOME/local/pfft-<version>/lib
+    FFTWLIB = $HOME/local/fftw-<version>/lib
 
diff --git a/doc/interface.rst b/doc/interface.rst
index 3bbfb58..ce3cf84 100644
--- a/doc/interface.rst
+++ b/doc/interface.rst
@@ -1,95 +1,117 @@
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\chapter{Interface Layers of the PFFT Library}\label{chap:api}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-We give a quick overview of the PFFT interface layers in the order of increasing flexibility at the example of c2c-FFTs.
-For r2c-, c2r-, and r2r-FFT similar interface layer specifications apply. A full reference list of all PFFT functions is given in Chapter~\ref{chap:ref}. 
-\section{Basic Interface}
-The \code{_3d} interface is the simplest interface layer. It is suitable for the planning of three-dimensional FFTs.
-\begin{lstlisting}
-ptrdiff_t pfft_local_size_dft_3d(
-    const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags,
-    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
-    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
-void pfft_local_block_dft_3d(
-    const ptrdiff_t *n, MPI_Comm comm_cart,
-    int pid, unsigned pfft_flags,
-    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
-    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
-pfft_plan pfft_plan_dft_3d(
-    const ptrdiff_t *n,
-    pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
-    int sign, unsigned pfft_flags);
-\end{lstlisting}
-Hereby, \code{n}, \code{local_ni}, \code{local_i_start}, \code{local_no}, and \code{local_o_start} are
-\code{ptrdiff_t} arrays of length \code{3}.
-
-The basic interface generalizes the \code{_3d} interface to FFTs of arbitrary dimension \code{rnk_n}.
-\begin{lstlisting}
-ptrdiff_t pfft_local_size_dft(
-    int rnk_n, const ptrdiff_t *n,
-    MPI_Comm comm_cart, unsigned pfft_flags,
-    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
-    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
-void pfft_local_block_dft(
-    int rnk_n, const ptrdiff_t *n,
-    MPI_Comm comm_cart, int pid, unsigned pfft_flags,
-    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
-    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
-pfft_plan pfft_plan_dft(
-    int rnk_n, const ptrdiff_t *n,
-    pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
-    int sign, unsigned pfft_flags);
-\end{lstlisting}
-Therefore, \code{n}, \code{local_ni}, \code{local_i_start}, \code{local_no}, and \code{local_o_start} become
-arrays of length \code{rnk_n}.
-
-\section{Advanced Interface}
-The advanced interface introduces the arrays \code{ni} and \code{no} of length \code{rnk_n}
-that give the pruned FFT input and output size.
-Furthermore, the arrays \code{iblock} and \code{oblock} of length \code{rnk_pm} (\code{rnk_pm} being the dimension of the process mesh)
-serve to adjust the block size of the input and output block decomposition.
-The additional parameter \code{howmany} gives the number of transforms that will be computed simultaneously.
-\begin{lstlisting}
-ptrdiff_t pfft_local_size_many_dft(
-    int rnk_n, const ptrdiff_t *n,
-    const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany,
-    const ptrdiff_t *iblock, const ptrdiff_t *oblock,
-    MPI_Comm comm_cart, unsigned pfft_flags,
-    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
-    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
-void pfft_local_block_many_dft(
-    int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no,
-    const ptrdiff_t *iblock, const ptrdiff_t *oblock,
-    MPI_Comm comm_cart, int pid, unsigned pfft_flags,
-    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
-    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
-pfft_plan pfft_plan_many_dft(
-    int rnk_n, const ptrdiff_t *n,
-    const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany,
-    const ptrdiff_t *iblock, const ptrdiff_t *oblock,
-    pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
-    int sign, unsigned pfft_flags);
-\end{lstlisting}
-
-
-\section{Preliminary: Skip Serial Transformations}\label{sec:skip-trafo}
-The \code{_skipped} interface extends the \code{_many} interface by adding the possibility to skip some of the serial FFTs.
-\begin{lstlisting}
-pfft_plan pfft_plan_many_dft_skipped(
-    int rnk_n, const ptrdiff_t *n,
-    const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany,
-    const ptrdiff_t *iblock, const ptrdiff_t *oblock,
-    (red@const int *skip_trafos,@*)
-    pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
-    int sign, unsigned pfft_flags);
-\end{lstlisting}
-Hereby, \code{skip_trafos} is an \code{int} array of length \code{rnk_pm+1} (\code{rnk_pm} being the mesh dimension of the communicator \code{comm_cart}).
-For \code{t=0,...,rnk_pm} set \code{skip_trafos[t]=1} if the \code{t}-th serial transformation should be computed, otherwise set \code{skip_trafos[t]=0}.
-Note that the local transpositions are always performed, since they are a prerequisite for the global communication to work.
-At the moment it is only possible to skip the whole serial transform along the last \code{rnk_n-rnk_pm-1} dimensions.
-However, this behaviour can be realized by a call of a \code{(rnk_pm+1)}-dimensional PFFT with
-\begin{lstlisting}
-for(int t=rnk_pm+1; t<rnk_n; t++)
-  howmany *= n[t];
-\end{lstlisting}
-and manual computation of the desired serial transforms along the last \code{rnk_n-rnk_pm-1} dimensions.
+Interface Layers of the PFFT Library
+====================================
+
+We give a quick overview of the PFFT interface layers in the order of
+increasing flexibility at the example of c2c-FFTs. For r2c-, c2r-, and
+r2r-FFT similar interface layer specifications apply. A full reference
+list of all PFFT functions is given in Chapter [chap:ref].
+
+Basic Interface
+---------------
+
+The ``_3d`` interface is the simplest interface layer. It is suitable for the
+planning of three-dimensional FFTs.
+
+::
+
+    ptrdiff_t pfft_local_size_dft_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_dft_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart,
+        int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    pfft_plan pfft_plan_dft_3d(
+        const ptrdiff_t *n,
+        pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Hereby, ``n``, ``local_ni``, ``local_i_start``, ``local_no``, and ``local_o_start`` are ``ptrdiff_t`` arrays of length ``3``.
+
+The basic interface generalizes the ``_3d`` interface to FFTs of arbitrary
+dimension ``rnk_n``.
+
+::
+
+    ptrdiff_t pfft_local_size_dft(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_dft(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    pfft_plan pfft_plan_dft(
+        int rnk_n, const ptrdiff_t *n,
+        pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Therefore, ``n``, ``local_ni``, ``local_i_start``, ``local_no``, and ``local_o_start`` become arrays of length ``rnk_n``.
+
+Advanced Interface
+------------------
+
+The advanced interface introduces the arrays ``ni`` and ``no`` of length ``rnk_n``
+that give the pruned FFT input and output size. Furthermore, the arrays ``iblock``
+and ``oblock`` of length ``rnk_pm`` (``rnk_pm`` being the dimension of the process
+mesh) serve to adjust the block size of the input and output block decomposition.
+The additional parameter ``howmany`` gives the number of transforms that will be
+computed simultaneously.
+
+::
+
+    ptrdiff_t pfft_local_size_many_dft(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_many_dft(
+        int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    pfft_plan pfft_plan_many_dft(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Preliminary: Skip Serial Transformations
+----------------------------------------
+
+The ``_skipped`` interface extends the ``_many`` interface by adding the possibility to skip
+some of the serial FFTs.
+
+::
+
+    pfft_plan pfft_plan_many_dft_skipped(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        const int *skip_trafos,
+        pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Hereby, ``skip_trafos`` is an ``int`` array of length ``rnk_pm+1`` (``rnk_pm`` being the
+mesh dimension of the communicator ``comm_cart``). For ``t=0,...,rnk_pm`` set
+``skip_trafos[t]=1`` if the ``t``-th serial transformation should be computed,
+otherwise set ``skip_trafos[t]=0``. Note that the local transpositions are always
+performed, since they are a prerequisite for the global communication to work.
+At the moment it is only possible to skip the whole serial transform along the last
+``rnk_n-rnk_pm-1`` dimensions. However, this behaviour can be realized by a call of a ``(rnk_pm+1)``-dimensional PFFT with
+
+::
+
+    for(int t=rnk_pm+1; t<rnk_n; t++)
+      howmany *= n[t];
+
+and manual computation of the desired serial transforms along the last
+``rnk_n-rnk_pm-1`` dimensions.
diff --git a/doc/intro.rst b/doc/intro.rst
index ec7ae97..45100a1 100644
--- a/doc/intro.rst
+++ b/doc/intro.rst
@@ -1,94 +1,153 @@
-\abstract{
-This user manual describes the usage of PFFT~\pfftversion~\cite{pfft,Pi13}, a MPI-based, parallel software library for the
-computation of equispaced fast Fourier transforms (FFT) on parallel, distributed memory architectures.
-The reader of this manual should familiar with the basic usage of FFTW and MPI.
-For further information we refer to the well written FFTW user manual~\cite{fftw-manual} and
-the MPI Standard~\cite{MPI-2.2}, see also \cite{GrLuTh99} for detailed explanations.}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\chapter{Introduction}\label{chap:intro}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-A popular software library for computing FFTs is FFTW~\cite{fftw, FFTW05}. This library also includes a parallel FFT implementation (FFTW-MPI) based on the Message Passing Interface (MPI).
-FFTW-MPI parallelizes multi-dimensional FFTs by a mixture of serial lower-dimensional FFTs and parallel data transpositions.
-However, FFTW-MPI makes use of a one-dimensional data decomposition, which shows to be a scalability bottleneck on large scale, parallel computers.
-For example, a three-dimensional FFT of size $1024^3$ can be computed with at most $1024$ MPI processes.
-In contrast, using a two-dimensional data decomposition would increase the maximum number of MPI processes to $1024^2$ in this case.
-
-The main goal of PFFT is to extend the MPI part of the FFTW software library to multi-dimensional data decompositions,
-i.e., $d$-dimensional FFTs of size $N^d$ can be computed in parallel with at most $N^{d-1}$ MPI processes.
-In addition, PFFT offers several extra features that are particular usefull for parallel, distributed memory FFTs but are not yet present in FFTW-MPI.
-We refer to the publication~\cite{Pi13} for a closer look on the different data decompositions and the underlying algorithms of the PFFT library.
-
-The interface of PFFT is as close as possible to the FFTW-MPI interface. 
-In fact, we consider every difference between PFFT and FFTW that is not explicitly mentioned within this manual as a bug that should be reported to \webpfft.
-Therefore, porting code that uses FFTW-MPI to PFFT is almost trivial, e.g. see Section~\ref{sec:porting}.
-
-Most features of PFFT are inherited from FFTW or similarily implemented. These include the following:
-\begin{compactitem}
-  \item We employ fast $\mathcal{O}(N\log N)$ algorithms of FFTW to compute arbitrary-size
-        discrete Fourier transforms of complex data, real data, and even- or odd-symmetric real data.
-  \item The dimension of the FFT can be arbitrary. However, parallel data decomposition must be at least one dimension smaller.
-  \item PFFT offers portable performance; e.g., it will perform well on most platforms.
-  \item The application of PFFT is split into a time consuming planning step and a high performance execution step.
-  \item Installing the library is easy. It is based on the common sequence of configure, make, and make install.
-  \item The interface of PFFT is very close to the MPI interface of FFTW.
-        In fact, we tried to add as few extra parameters as possible.
-  \item PFFT is written in C but also offers a Fortran interface, see Section~\ref{sec:fortran}.
-  \item FFTW includes shared memory parallelism for all serial transforms. This enables us to benefit from hybrid parallelism to a certain amount, see Section~\ref{sec:openmp}.
-  \item All steps of our parallel FFT can be performed completely in place. This is especially remarkable for the global
-        transposition routines.
-  \item Confirming to good MPI programming practice, all PFFT transforms can be performed on user defined communicators.
-        In other words, PFFT does not enforce the user to work with \verb+MPI_COMM_WORLD+.
-  \item PFFT uses the same algorithm to compute the size of the local array blocks as FFTW. This implies that the FFT size need not
-        be divisible by the number of processes.
-  \item PFFT supports single, double and long double precision.
-  \item PFFT supports new-array execution, i.e., a PFFT plan can be planned and executed on different plans up to some restrictions, see Section~\ref{sec:new-array} for details.
-        Thanks to Yu Feng for the new-array execute patch.
-\end{compactitem}
-Furthermore, we added some special features to support repeated tasks that often occur in practical application of parallel FFTs.
-\begin{compactitem}
-  \item PFFT includes a very flexible ghost cell exchange module. A detailed description of this module is given in Section~\ref{sec:gc}.
-  \item PFFT accepts three-dimensional data decomposition even for three-dimen\-sional FFTs.
-        However, the underlying parallel FFT framework is still based on two-dimensional decomposition. A more detailed description can be found
-        in Section~\ref{sec:3don2d}.
-  \item PFFT explicitly supports the parallel calculation of pruned FFTs. Details are given in Section~\ref{sec:pruned}.
-\end{compactitem}
-
-Finally, we complete this overview with a list of features that are (not yet) implemented in PFFT.
-\begin{compactitem}
-  \item Parallel one-dimensional FFT based on MPI. FFTW-MPI uses another parallelization strategy for one-dimensional FFTs, which is not implemented in PFFT.
-        The reason is that we can not achive a scalability benefit due to higher dimensional data decomposition if the FFT has only one dimension.
-        Therefore, one can also call FFTW directly in this case.
-  \item There is no equivalent of FFTW \emph{wisdom} in PFFT, i.e., you can not save a PFFT plan to disk and restore it for later use.
-  \item PFFT does not have full OpenMP support. All serial FFT computations and global communications are implemented with FFTW,
-        which offers OpenMP support, see Section~\ref{sec:openmp}. However, most of the PFFT-only features, such as pruned FFT, ghost cell send and 3d decompostion of 3d FFTs are not yet parallelized with OpenMP.
-  \item PFFT does not have full SIMD support. All serial FFT computations and global communications are implemented with FFTW,
-        which offers SIMD support, see Section~\ref{sec:simd}. However, most of the PFFT-only features, such as pruned FFT, ghost cell send and 3d decompostion of 3d FFTs are not yet parallelized with SIMD.
-  \item PFFT does not overlap communication and computation. The code of PFFT is build in a very modularized structure. Most of these modules consist
-        of FFTWs routines. Therefore, the global transposition does not support non blocking communication.
-  \item Similar to FFTW, we do not provide any parallel IO routines. The user is responsible of load and store of parallel data.
-  \item PFFT depends on FFTW to perform its serial transforms and does not support different vendor FFTs (such as Intel's MKL or IBM's ESSL).
-        However, this is not assumed to be a big drawback, since FFTW seems to perform very well on most platforms.
-%         We apply FFTW to multi-dimensional data sets in order to compute serial FFTs along single dimensions combined with transpositions of the multi-dimensional data in one step.
-%         As far as we know, there is no other FFT library that performs these two tasks. In addition, we use FFTW for local and global, i.e. serial and parallel, data transpositions.
-%         Thereby, changing the FFT vendor would affect only a 
-%         However, this is not assumed to be a big drawback, since FFTW seems to perform very well on most platforms.
-  \item The global communication routines can not be called separately. However, it should be possible to implement a user interface to our global
-        transposition routines.
-  \item PFFT does not support GPU parallelization.
-\end{compactitem}
-You are welcome to propose new PFFT features at \webpfft.
-
-\section{Alternative parallel FFT implementations}
-There have been several FFT implementations that aim to circumvent the scalability bottleneck
-for at least three dimensional FFTs by using two-dimensional decomposition approach.
-However, these implementations are often fitted to special problems and where not published
-as a stand alone software library. 
-Remarkable exceptions are the parallel FFT software library by S.~Plimpton~\cite{Pl97,sandiafft},
-the P3DFFT software library by D.~Pekurovsky~\cite{Pe12,p3dfft} and the \mbox{2DECOMP\&FFT} software library by N.~Li~\cite{Li2010, 2decompfft}.
-
-\section{Parallel nonequispaced FFT}
-If your are interested in a parallel implementation of nonequispaced fast Fourier
-transforms (NFFT) for distributed memory architectures, you should have a look at our PNFFT software library~\cite{pnfft, PiPo13}
-that is also available at \webpnfft.
+Introduction
+============
 
+A popular software library for computing FFTs is FFTW. This library
+also includes a parallel FFT implementation (FFTW-MPI) based on the
+Message Passing Interface (MPI). FFTW-MPI parallelizes multi-dimensional
+FFTs by a mixture of serial lower-dimensional FFTs and parallel data
+transpositions. However, FFTW-MPI makes use of a one-dimensional data
+decomposition, which shows to be a scalability bottleneck on large
+scale, parallel computers. For example, a three-dimensional FFT of size
+:math:`1024^3` can be computed with at most :math:`1024` MPI processes.
+In contrast, using a two-dimensional data decomposition would increase
+the maximum number of MPI processes to :math:`1024^2` in this case.
+
+The main goal of PFFT is to extend the MPI part of the FFTW software
+library to multi-dimensional data decompositions, i.e.,
+:math:`d`-dimensional FFTs of size :math:`N^d` can be computed in
+parallel with at most :math:`N^{d-1}` MPI processes. In addition, PFFT
+offers several extra features that are particularly useful for parallel,
+distributed memory FFTs but are not yet present in FFTW-MPI. We refer to
+the publication [Pi13] for a closer look at the different data decompositions
+and the underlying algorithms of the PFFT library.
+
+The interface of PFFT is as close as possible to the FFTW-MPI interface.
+In fact, we consider every difference between PFFT and FFTW that is not
+explicitly mentioned within this manual as a bug that should be reported
+to the PFFT developers. Therefore, porting code that uses FFTW-MPI to PFFT is almost
+trivial, e.g. see Section [sec:porting].
+
+Most features of PFFT are inherited from FFTW or similarly implemented.
+These include the following:
+
+We employ fast :math:`\mathcal{O}(N\log N)` algorithms of FFTW to
+compute arbitrary-size discrete Fourier transforms of complex data, real
+data, and even- or odd-symmetric real data.
+
+The dimension of the FFT can be arbitrary. However, parallel data
+decomposition must be at least one dimension smaller.
+
+PFFT offers portable performance; e.g., it will perform well on most
+platforms.
+
+The application of PFFT is split into a time consuming planning step and
+a high performance execution step.
+
+Installing the library is easy. It is based on the common sequence of
+configure, make, and make install.
+
+The interface of PFFT is very close to the MPI interface of FFTW. In
+fact, we tried to add as few extra parameters as possible.
+
+PFFT is written in C but also offers a Fortran interface, see
+Section [sec:fortran].
+
+FFTW includes shared memory parallelism for all serial transforms. This
+enables us to benefit from hybrid parallelism to a certain amount, see
+Section [sec:openmp].
+
+All steps of our parallel FFT can be performed completely in place. This
+is especially remarkable for the global transposition routines.
+
+Conforming to good MPI programming practice, all PFFT transforms can be
+performed on user defined communicators. In other words, PFFT does not
+force the user to work with ``MPI_COMM_WORLD``.
+
+PFFT uses the same algorithm to compute the size of the local array
+blocks as FFTW. This implies that the FFT size need not be divisible by
+the number of processes.
+
+PFFT supports single, double and long double precision.
+
+PFFT supports new-array execution, i.e., a PFFT plan can be planned and
+executed on different arrays up to some restrictions, see
+Section [sec:new-array] for details. Thanks to Yu Feng for the new-array
+execute patch.
+
+Furthermore, we added some special features to support repeated tasks
+that often occur in practical application of parallel FFTs.
+
+PFFT includes a very flexible ghost cell exchange module. A detailed
+description of this module is given in Section [sec:gc].
+
+PFFT accepts three-dimensional data decomposition even for
+three-dimensional FFTs. However, the underlying parallel FFT framework
+is still based on two-dimensional decomposition. A more detailed
+description can be found in Section [sec:3don2d].
+
+PFFT explicitly supports the parallel calculation of pruned FFTs.
+Details are given in Section [sec:pruned].
+
+Finally, we complete this overview with a list of features that are (not
+yet) implemented in PFFT.
+
+Parallel one-dimensional FFT based on MPI. FFTW-MPI uses another
+parallelization strategy for one-dimensional FFTs, which is not
+implemented in PFFT. The reason is that we cannot achieve a scalability
+benefit due to higher dimensional data decomposition if the FFT has only
+one dimension. Therefore, one can also call FFTW directly in this case.
+
+There is no equivalent of FFTW *wisdom* in PFFT, i.e., you can not save
+a PFFT plan to disk and restore it for later use.
+
+PFFT does not have full OpenMP support. All serial FFT computations and
+global communications are implemented with FFTW, which offers OpenMP
+support, see Section [sec:openmp]. However, most of the PFFT-only
+features, such as pruned FFT, ghost cell send and 3d decomposition of 3d
+FFTs are not yet parallelized with OpenMP.
+
+PFFT does not have full SIMD support. All serial FFT computations and
+global communications are implemented with FFTW, which offers SIMD
+support, see Section [sec:simd]. However, most of the PFFT-only
+features, such as pruned FFT, ghost cell send and 3d decomposition of 3d
+FFTs are not yet parallelized with SIMD.
+
+PFFT does not overlap communication and computation. The code of PFFT is
+built in a very modularized structure. Most of these modules consist of
+FFTW routines. Therefore, the global transposition does not support
+non-blocking communication.
+
+Similar to FFTW, we do not provide any parallel IO routines. The user is
+responsible for loading and storing parallel data.
+
+PFFT depends on FFTW to perform its serial transforms and does not
+support different vendor FFTs (such as Intel’s MKL or IBM’s ESSL).
+However, this is not assumed to be a big drawback, since FFTW seems to
+perform very well on most platforms.
+
+The global communication routines can not be called separately. However,
+it should be possible to implement a user interface to our global
+transposition routines.
+
+PFFT does not support GPU parallelization.
+
+You are welcome to propose new PFFT features to the PFFT developers.
+
+Alternative parallel FFT implementations
+----------------------------------------
+
+There have been several FFT implementations that aim to circumvent the
+scalability bottleneck for at least three-dimensional FFTs by using a
+two-dimensional decomposition approach. However, these implementations
+are often fitted to special problems and were not published as a
+stand-alone software library. Remarkable exceptions are the parallel FFT
+software library by S. Plimpton, the P3DFFT software library by
+D. Pekurovsky and the 2DECOMP&FFT software library by N. Li.
+
+Parallel nonequispaced FFT
+--------------------------
+
+If you are interested in a parallel implementation of nonequispaced
+fast Fourier transforms (NFFT) for distributed memory architectures,
+you should have a look at our PNFFT software library, which is also
+available online.
diff --git a/doc/reference.rst b/doc/reference.rst
new file mode 100644
index 0000000..3e9537d
--- /dev/null
+++ b/doc/reference.rst
@@ -0,0 +1,1550 @@
+PFFT Reference
+==============
+
+Files and Data Types
+--------------------
+
+You must include the PFFT header file by
+
+::
+
+    #include <pfft.h>
+
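+in the preamble of each source file that calls PFFT. This header
+automatically includes ``fftw3.h`` and ``fftw3-mpi.h``. Therefore, PFFT can use the ``fftw_complex`` data type
+defined in ``fftw3.h``, see the FFTW manual. Note that ``fftw_complex`` is defined to be the C99 native complex
+whenever ``<complex.h>`` is included *before* ``<fftw3.h>``, ``<fftw3-mpi.h>`` and ``<pfft.h>``. Otherwise it is defined as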
+
+::
+
+    typedef double fftw_complex[2];
+
+For the sake of a clean namespace we define the wrapper data type as
+
+::
+
+    typedef fftw_complex pfft_complex;
+
+that can be used equivalently to ``fftw_complex``. Furthermore, we define the wrapper
+functions
+
+::
+
+    void *pfft_malloc(size_t n);
+    double *pfft_alloc_real(size_t n);
+    pfft_complex *pfft_alloc_complex(size_t n);
+    void pfft_free(void *p);
+
+as substitutes for their corresponding FFTW equivalents, see the FFTW manual. Note that
+memory allocated by one of these functions must be freed with ``pfft_free`` (or its
+equivalent ``fftw_free``). Because of the performance reasons given in the FFTW manual we recommend
+to use one of the ``pfft_alloc`` (or its equivalent ``fftw_alloc``) allocation functions for all
+arrays containing FFT inputs and outputs. However, PFFT will also work
+(possibly slower) with any other memory allocation method.
+
+Different precisions are handled as in FFTW: That is, functions and
+datatypes become ``pfftf_`` (single precision) or ``pfftl_`` (long double precision) prefixed.
+Quadruple precision is not yet supported. The main problem is that we do
+not know about a suitable MPI datatype to represent ``__float128``.
+
+MPI Initialization
+------------------
+
+Initialization and cleanup of PFFT is done in the same way as for
+FFTW-MPI, see the FFTW manual. In order to keep a clean name space, PFFT offers the
+wrapper functions
+
+::
+
+    void pfft_init(void);
+    void pfft_cleanup(void);
+
+that can be used as substitutes for ``fftw_mpi_init`` and ``fftw_mpi_cleanup``, respectively.
+
+Using PFFT Plans
+----------------
+
+PFFT follows exactly the same workflow as FFTW-MPI. A plan created by
+one of the functions given in Section [sec:create-plan] is executed with
+
+::
+
+    void pfft_execute(const pfft_plan plan);
+
+and freed with
+
+::
+
+    void pfft_destroy_plan(const pfft_plan plan);
+
+Note that you can *not* apply the FFTW functions ``fftw_execute`` or ``fftw_destroy_plan`` on PFFT plans.
+
+The new array execute functions are given by
+
+::
+
+    void pfft_execute_dft(const pfft_plan plan, pfft_complex *in, pfft_complex *out);
+    void pfft_execute_dft_r2c(const pfft_plan plan, double *in, pfft_complex *out);
+    void pfft_execute_dft_c2r(const pfft_plan plan, pfft_complex *in, double *out);
+    void pfft_execute_r2r(const pfft_plan plan, double *in, double *out);
+
+The arrays given by ``in`` and ``out`` must have the correct size and the same
+alignment as the arrays that were used to create the plan, just as it is
+the case for FFTW, see [fftw-new-array].
+
+Data Distribution Functions
+---------------------------
+
+Complex-to-Complex FFT
+~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+    ptrdiff_t pfft_local_size_dft_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_dft(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_many_dft(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Compute the data distribution of a parallel, complex input/output
+discrete Fourier transform (DFT) in two or more dimensions, returning
+the number of *complex* numbers that must be allocated to hold the
+parallel transform.
+
+Arguments:
+
+``rnk_n`` is the rank of the transform (typically the size of the arrays ``n``, ``ni``, ``no``)
+that can be any integer :math:`\ge 2`. The ``_3d`` planner corresponds to a ``rnk_n`` of
+3.
+
+The array ``n`` of size ``rnk_n`` specifies the transform dimensions. They can be any
+positive integer.
+
+The array ``ni`` of size ``rnk_n`` specifies the input array dimensions. They can be any
+positive integer with ``ni[t] <= n[t]`` for all dimensions ``t=0,...,rnk_n-1``. For ``ni[t] < n[t]`` the inputs will be padded
+with zeros up to size ``n[t]`` along the ``t``-th dimension before the transform, see
+Section [sec:pruned-fft].
+
+The array ``no`` of size ``rnk_n`` specifies the output array dimensions. They can be any
+positive integer with ``no[t] <= n[t]`` for all dimensions ``t=0,...,rnk_n-1``. For ``no[t] < n[t]`` the outputs will be
+pruned to size ``no[t]`` along the ``t``-th dimension after the transform, see
+Section [sec:pruned-fft].
+
+``howmany`` is the number of transforms to compute. The resulting plan computes
+``howmany`` transforms, where the input of the k-th transform is at location
+in+k (in C pointer arithmetic) with stride ``howmany``, and its output is at
+location out+k with stride ``howmany``. The basic interface corresponds to
+howmany=1.
+
+``comm_cart`` is a Cartesian communicator of dimension ``rnk_pm`` that specifies the parallel
+data decomposition, see Section [sec:data-decomp]. Most of the time,
+PFFT requires ``rnk_pm < rnk_n``. The only exception is the case ``rnk_pm == rnk_n == 3``, see
+Section [sec:3don3d]. If an ordinary (i.e. non-Cartesian) communicator
+is passed, PFFT internally converts it into a one-dimensional Cartesian
+communicator while retaining the MPI ranks (this results in the FFTW-MPI
+data decomposition).
+
+The arrays ``iblock`` and ``oblock`` of size ``rnk_pm+1`` specify the block sizes for the first ``rnk_pm+1`` dimensions
+of the input and output data, respectively. These must be the same block
+sizes as were passed to the corresponding ``local_size`` function. You can pass ``PFFT_DEFAULT_BLOCKS`` to use
+PFFT’s default block sizes. Furthermore, you can use ``PFFT_DEFAULT_BLOCK`` to set the default
+block size in separate dimensions, e.g., ``iblock[t]=PFFT_DEFAULT_BLOCK``.
+
+``pfft_flags`` is a bitwise OR (``|``) of zero or more planner flags, as defined in
+Section [sec:flags].
+
+The array ``local_ni`` of size ``rnk_n`` returns the size of the local input array block in
+every dimension (counted in units of complex numbers).
+
+The array ``local_i_start`` of size ``rnk_n`` returns the offset of the local input array block in
+every dimension (counted in units of complex numbers).
+
+The array ``local_no`` of size ``rnk_n`` returns the size of the local output array block in
+every dimension (counted in units of complex numbers).
+
+The array ``local_o_start`` of size ``rnk_n`` returns the offset of the local output array block in
+every dimension (counted in units of complex numbers).
+
+In addition, the following ``local_block`` functions compute the local data distribution
+of the process with MPI rank ``pid``. The ``local_size`` interface can be understood as a call
+of ``local_block`` where ``pid`` is given by ``MPI_Comm_rank(comm_cart, &pid)``, i.e., each MPI process computes its own data
+block. However, ``local_block`` functions have a ``void`` return type, i.e., they omit the
+computation of the local array size that is necessary to hold the
+parallel transform. This makes ``local_block`` functions substantially faster in
+execution.
+
+::
+
+    void pfft_local_block_dft_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_dft(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_many_dft(
+        int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Real-to-Complex FFT
+~~~~~~~~~~~~~~~~~~~
+
+::
+
+    ptrdiff_t pfft_local_size_dft_r2c_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_dft_r2c(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_many_dft_r2c(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Compute the data distribution of a parallel, real-input/complex-output
+discrete Fourier transform (DFT) in two or more dimensions, returning
+the number of *complex* numbers that must be allocated to hold the
+parallel transform.
+
+Arguments are the same as for c2c transforms (see
+Section [sec:local-size-c2c]) with the following exceptions:
+
+The logical input array size ``ni`` will differ from the physical array size of
+the real inputs if the flag ``PFFT_PADDED_R2C`` is included in ``pfft_flags``. This results from the
+padding at the end of the last dimension that is necessary to align the
+real valued inputs and complex valued outputs for inplace transforms,
+see the FFTW manual. In contrast to FFTW-MPI, PFFT does not pad the r2c inputs per
+default.
+
+``local_ni`` is counted in units of real numbers. It will include padding.
+
+``local_i_start`` is counted in units of real numbers.
+
+The corresponding ``local_block`` functions compute the local data distribution of the
+process with MPI rank ``pid``.
+
+::
+
+    void pfft_local_block_dft_r2c_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_dft_r2c(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_many_dft_r2c(
+        int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Complex-to-Real FFT
+~~~~~~~~~~~~~~~~~~~
+
+::
+
+    ptrdiff_t pfft_local_size_dft_c2r_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_dft_c2r(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_many_dft_c2r(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Compute the data distribution of a parallel, complex-input/real-output
+discrete Fourier transform (DFT) in two or more dimensions, returning
+the number of *complex* numbers that must be allocated to hold the
+parallel transform.
+
+Arguments are the same as for c2c transforms (see
+Section [sec:local-size-c2c]) with the following exceptions:
+
+The logical output array size ``no`` will differ from the physical array size
+of the real outputs if the flag ``PFFT_PADDED_C2R`` is included in ``pfft_flags``. This results from the
+padding at the end of the last dimension that is necessary to align the
+real valued outputs and complex valued inputs for inplace transforms,
+see the FFTW manual. In contrast to FFTW-MPI, PFFT does not pad the c2r outputs per
+default.
+
+``local_no`` is counted in units of real numbers.
+
+``local_o_start`` is counted in units of real numbers.
+
+The corresponding ``local_block`` functions compute the local data distribution of the
+process with MPI rank ``pid``.
+
+::
+
+    void pfft_local_block_dft_c2r_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_dft_c2r(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_many_dft_c2r(
+        int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Real-to-Real FFT
+~~~~~~~~~~~~~~~~
+
+::
+
+    ptrdiff_t pfft_local_size_r2r_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_r2r(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    ptrdiff_t pfft_local_size_many_r2r(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Compute the data distribution of a parallel, real input/output
+discrete Fourier transform (DFT) in two or more dimensions, returning
+the number of *real* numbers that must be allocated to hold the parallel
+transform.
+
+Arguments are the same as for c2c transforms (see
+Section [sec:local-size-c2c]) with the following exceptions:
+
+``local_ni`` is counted in units of real numbers.
+
+``local_i_start`` is counted in units of real numbers.
+
+``local_no`` is counted in units of real numbers.
+
+``local_o_start`` is counted in units of real numbers.
+
+The corresponding ``local_block`` functions compute the local data distribution of the
+process with MPI rank ``pid``.
+
+::
+
+    void pfft_local_block_r2r_3d(
+        const ptrdiff_t *n, MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_r2r(
+        int rnk_n, const ptrdiff_t *n,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+    void pfft_local_block_many_r2r(
+        int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        MPI_Comm comm_cart, int pid, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Plan Creation
+-------------
+
+Complex-to-Complex FFT
+~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+    pfft_plan pfft_plan_dft_3d(
+        const ptrdiff_t *n, pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_dft(
+        int rnk_n, const ptrdiff_t *n, pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_dft(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_dft_skipped(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        const int *skip_trafos, pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Plan a parallel, complex input/output discrete Fourier transform (DFT)
+in two or more dimensions, returning a ``pfft_plan``. The planner
+returns NULL if the plan cannot be created.
+
+Arguments:
+
+``rnk_n``, ``n``, ``ni``, ``no``, ``howmany``, ``iblock``, ``oblock``,
+``comm_cart`` must be the same as passed to the corresponding local size
+function, see Section [sec:local-size-c2c].
+
+The array of size specifies the serial transforms that will be omitted.
+For set if the -th serial transformation should be computed, otherwise
+set , see Section [sec:skip-trafo] for more details.
+
+``in`` and ``out`` point to the complex valued input and output arrays
+of the transform, which may be the same (yielding an in-place
+transform). These arrays are overwritten during planning, unless
+``PFFT_ESTIMATE`` is used in the flags. (The arrays need not be
+initialized, but they must be allocated.)
+
+``sign`` is the sign of the exponent in the formula that defines the
+Fourier transform. It can be -1 (= ``PFFT_FORWARD``) or +1
+(= ``PFFT_BACKWARD``).
+
+``pfft_flags`` is a bitwise OR (``|``) of zero or more planner flags, as
+defined in Section [sec:flags].
+
+PFFT computes an unnormalized transform: computing a forward followed by
+a backward transform (or vice versa) will result in the original data
+multiplied by the size of the transform (the product of the dimensions
+``n[t]``).
+
+Real-to-Complex FFT
+~~~~~~~~~~~~~~~~~~~
+
+::
+
+    pfft_plan pfft_plan_dft_r2c_3d(
+        const ptrdiff_t *n, double *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_dft_r2c(
+        int rnk_n, const ptrdiff_t *n, double *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_dft_r2c(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        double *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_dft_r2c_skipped(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        const int *skip_trafos, double *in, pfft_complex *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Plan a parallel, real-input/complex-output discrete Fourier transform
+(DFT) in two or more dimensions, returning a ``pfft_plan``. The planner
+returns NULL if the plan cannot be created.
+
+Arguments:
+
+``rnk_n``, ``n``, ``ni``, ``no``, ``howmany``, ``iblock``, ``oblock``,
+``comm_cart`` must be the same as passed to the corresponding local size
+function, see Section [sec:local-size-r2c].
+
+``in`` and ``out`` point to the real valued input and complex valued
+output arrays of the transform, which may be the same (yielding an
+in-place transform). These arrays are overwritten during planning,
+unless ``PFFT_ESTIMATE`` is used in the flags. (The arrays need not be
+initialized, but they must be allocated.)
+
+``sign`` is the sign of the exponent in the formula that defines the
+Fourier transform. It can be -1 (= ``PFFT_FORWARD``) or +1
+(= ``PFFT_BACKWARD``). Note that this parameter is not
+part of the FFTW-MPI interface, where r2c transforms are defined to be
+forward transforms. However, the backward transform can be easily
+realized by an additional conjugation of the complex outputs as done by
+PFFT.
+
+Complex-to-Real FFT
+~~~~~~~~~~~~~~~~~~~
+
+::
+
+    pfft_plan pfft_plan_dft_c2r_3d(
+        const ptrdiff_t *n, pfft_complex *in, double *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_dft_c2r(
+        int rnk_n, const ptrdiff_t *n, pfft_complex *in, double *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_dft_c2r(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        pfft_complex *in, double *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_dft_c2r_skipped(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        const int *skip_trafos, pfft_complex *in, double *out, MPI_Comm comm_cart,
+        int sign, unsigned pfft_flags);
+
+Plan a parallel, complex-input/real-output discrete Fourier transform
+(DFT) in two or more dimensions, returning a ``pfft_plan``. The planner
+returns NULL if the plan cannot be created.
+
+Arguments:
+
+``rnk_n``, ``n``, ``ni``, ``no``, ``howmany``, ``iblock``, ``oblock``,
+``comm_cart`` must be the same as passed to the corresponding local size
+function, see Section [sec:local-size-c2r].
+
+``in`` and ``out`` point to the complex valued input and real valued
+output arrays of the transform, which may be the same (yielding an
+in-place transform). These arrays are overwritten during planning,
+unless ``PFFT_ESTIMATE`` is used in the flags. (The arrays need not be
+initialized, but they must be allocated.)
+
+``sign`` is the sign of the exponent in the formula that defines the
+Fourier transform. It can be -1 (= ``PFFT_FORWARD``) or +1
+(= ``PFFT_BACKWARD``). Note that this parameter is not
+part of the FFTW-MPI interface, where c2r transforms are defined to be
+backward transforms. However, the forward transform can be easily
+realized by an additional conjugation of the complex inputs as done by
+PFFT.
+
+Real-to-Real FFT
+~~~~~~~~~~~~~~~~
+
+::
+
+    pfft_plan pfft_plan_r2r_3d(
+        const ptrdiff_t *n, double *in, double *out, MPI_Comm comm_cart,
+        const pfft_r2r_kind *kinds, unsigned pfft_flags);
+    pfft_plan pfft_plan_r2r(
+        int rnk_n, const ptrdiff_t *n, double *in, double *out, MPI_Comm comm_cart,
+        const pfft_r2r_kind *kinds, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_r2r(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        double *in, double *out, MPI_Comm comm_cart,
+        const pfft_r2r_kind *kinds, unsigned pfft_flags);
+    pfft_plan pfft_plan_many_r2r_skipped(
+        int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no,
+        ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock,
+        const int *skip_trafos, double *in, double *out, MPI_Comm comm_cart,
+        const pfft_r2r_kind *kinds, unsigned pfft_flags);
+
+Plan a parallel, real input/output (r2r) transform in two or more
+dimensions, returning a ``pfft_plan``. The planner returns NULL if the
+plan cannot be created.
+
+Arguments:
+
+``rnk_n``, ``n``, ``ni``, ``no``, ``howmany``, ``iblock``, ``oblock``,
+``comm_cart`` must be the same as passed to the corresponding local size
+function, see Section [sec:local-size-r2r].
+
+``in`` and ``out`` point to the real valued input and output arrays of
+the transform, which may be the same (yielding an in-place transform).
+These arrays are overwritten during planning, unless ``PFFT_ESTIMATE``
+is used in the flags. (The arrays need not be initialized, but they must
+be allocated.)
+
+The array ``kinds`` of length ``rnk_n`` specifies the kind of r2r
+transform that is computed in the corresponding dimensions. Just like
+FFTW-MPI we compute the separable product formed by taking each
+transform kind along the corresponding dimension, one dimension after
+another.
+
+FFT Execution Timer
+-------------------
+
+PFFT offers an easy way to perform run time measurements and print/write
+the results.
+
+Basic Run Time Measurements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+PFFT-plans automatically accumulate the local run times of every call to
+``pfft_execute``. For most applications it is sufficient to print the
+run time of a plan averaged over all runs with
+
+::
+
+    void pfft_print_average_timer(
+        const pfft_plan ths, MPI_Comm comm);
+
+Note that for each timer the maximum time over all processes is reduced
+to rank ``0`` of communicator ``comm``, i.e., a call to ``MPI_Reduce`` is
+performed and the output is only printed on this process. The following
+function works in the same way but prints more verbose output
+
+::
+
+    void pfft_print_average_timer_adv(
+        const pfft_plan ths, MPI_Comm comm);
+
+To write the averaged run time of plan ``ths`` into a file called
+``name`` use
+
+::
+
+    void pfft_write_average_timer(
+        const pfft_plan ths, const char *name, MPI_Comm comm);
+    void pfft_write_average_timer_adv(
+        const pfft_plan ths, const char *name, MPI_Comm comm);
+
+Again, the output is only written on rank ``0`` of communicator ``comm``.
+
+Discard all the recorded run times with
+
+::
+
+    void pfft_reset_timer(
+        pfft_plan ths);
+
+This function is called per default at the end of every PFFT plan
+creation function.
+
+Advanced Timer Manipulation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In order to access the run times directly a new typedef ``pfft_timer`` is
+introduced. The following function returns a copy of the timer
+corresponding to PFFT plan ``ths``
+
+::
+
+    pfft_timer pfft_get_timer(
+        const pfft_plan ths);
+
+Note that the memory of the returned ``pfft_timer`` must be released with
+
+::
+
+    void pfft_destroy_timer(
+        pfft_timer ths);
+
+as soon as the timer is not needed anymore.
+
+In the following we introduce some routines to perform basic operations
+on timers. For all functions with a ``pfft_timer`` return value you must
+use ``pfft_destroy_timer`` in order to release the allocated memory of
+the timer. Create a copy of a PFFT-timer with
+
+::
+
+    pfft_timer pfft_copy_timer(
+        const pfft_timer orig);
+
+Compute the average, local time over all runs of a timer with
+
+::
+
+    void pfft_average_timer(
+        pfft_timer ths);
+
+Create a new timer that contains the sum of two timers ``sum1`` and
+``sum2`` with
+
+::
+
+    pfft_timer pfft_add_timers(
+        const pfft_timer sum1, const pfft_timer sum2);
+
+Create a timer that contains the maximum times of all the timers from
+all processes belonging to communicator ``comm`` with
+
+::
+
+    pfft_timer pfft_reduce_max_timer(
+        const pfft_timer ths, MPI_Comm comm);
+
+Since this function calls ``MPI_Reduce``, only the first process (rank 0)
+of ``comm`` will get the desired data while all the other processes have
+timers with undefined values.
+
+Note, that you can not access the elements of a timer directly, since it
+is only a pointer to a . However, PFFT offers a routine that creates an
+array and copies all the entries of the timer into it
+
+::
+
+    double* pfft_convert_timer2vec(
+        const pfft_timer ths);
+
+Remember to use in order to release the allocated memory of the returned
+array at the moment it is not needed anymore. The entries of the
+returned array are ordered as follows:
+
+dimension of the process mesh
+
+number of serial trafos
+
+number of global remaps
+
+number of runs
+
+local run time of all runs
+
+local times of the serial trafos
+
+local times of the global remaps
+
+2 times of the global remaps that are only necessary for
+three-dimensional FFTs on three-dimensional process meshes
+
+time for computing twiddled input (as needed for )
+
+time for computing twiddled output (as needed for )
+
+The complementary function
+
+::
+
+    pfft_timer pfft_convert_vec2timer(
+        const double *times);
+
+creates a timer and fills its entries with the data from array
+``times``. Thereby, the entries of ``times`` must be in the same order as
+above.
+
+Ghost Cell Communication
+------------------------
+
+In the following we describe the PFFT ghost cell communication module.
+At the moment, PFFT ghost cell communication is restricted to
+three-dimensional arrays.
+
+Assume a three-dimensional array ``data`` of size
+``n[0] x n[1] x n[2]`` that is distributed in blocks such that each
+process has a local copy of ``data[k[0],k[1],k[2]]`` with
+
+::
+
+    local_start[t] <= k[t] < local_start[t] + local_n[t]
+
+Here and in the following, we assume ``t=0,1,2``. The “classical” ghost
+cell exchange communicates all the necessary data between neighboring
+processes, such that each process gets a local copy of
+``data[k[0],k[1],k[2]]`` with
+
+::
+
+    local_gc_start[t] <= k[t] < local_gc_start[t] + local_ngc[t]
+
+where
+
+::
+
+    local_gc_start[t] = local_start[t] - gc_below[t];
+    local_ngc[t] = local_n[t] + gc_below[t] + gc_above[t];
+
+I.e., the local array block is increased in every dimension by
+``gc_below[t]`` elements below and ``gc_above[t]`` elements above.
+Hereby, the data is wrapped periodically whenever ``k[t]`` exceeds the
+array dimensions. The number of ghost cells in every
+dimension can be chosen independently and can be arbitrary large, i.e.,
+PFFT ghost cell communication also handles the case where the requested
+data exceeds next neighbor communication. The number of ghost cells can
+even be bigger than the array size, which results in multiple local
+copies of the same data elements at every process. However, the arrays
+``gc_below`` and ``gc_above`` must be equal among all MPI processes.
+
+PFFT ghost cell communication can work on both the input and output
+array distributions. Substitute ``local_n`` and ``local_start`` by
+``local_ni`` and ``local_i_start`` if you are interested in ghost cell
+communication of the input array. For ghost cell communication of the
+output array, substitute ``local_n`` and ``local_start`` by ``local_no``
+and ``local_o_start``.
+
+Using Ghost Cell Plans
+~~~~~~~~~~~~~~~~~~~~~~
+
+We introduce a new datatype ``pfft_gcplan`` that stores all the
+necessary information for ghost cell communication. Using a ghost cell
+plan follows the
+typical workflow: At first, determine the parallel data distribution;
+cf. Section [sec:gc:local-size]. Next, create a ghost cell plan; cf.
+Section [sec:gc:plan-cdata] and Section [sec:gc:plan-rdata]. Execute the
+ghost cell communication with one of the following two collective
+functions
+
+::
+
+    void pfft_exchange(
+        pfft_gcplan ths);
+    void pfft_reduce(
+        pfft_gcplan ths);
+
+Hereby, a ghost cell exchange creates duplicates of local data elements
+on next neighboring processes, while a ghost cell reduce is the adjoint
+counter part of the exchange, i.e., it adds the sum of all the
+duplicates of a local data element to the original data element.
+Finally, free the allocated memory with
+
+::
+
+    void pfft_destroy_gcplan(
+        pfft_gcplan ths);
+
+if the plan is not needed anymore. Passing a freed plan to
+``pfft_exchange`` or ``pfft_reduce`` results in undefined behavior.
+
+Data Distribution
+~~~~~~~~~~~~~~~~~
+
+Corresponding to the three interface layers for FFT planning, there are
+the following three layers for computing the ghost cell data
+distribution:
+
+::
+
+    ptrdiff_t pfft_local_size_gc_3d(
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        ptrdiff_t *local_ngc, ptrdiff_t *local_gc_start);
+    ptrdiff_t pfft_local_size_gc(
+        int rnk_n, 
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        ptrdiff_t *local_ngc, ptrdiff_t *local_gc_start);
+    ptrdiff_t pfft_local_size_many_gc(
+        int rnk_n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        ptrdiff_t howmany,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        ptrdiff_t *local_ngc, ptrdiff_t *local_gc_start);
+
+Hereby, ``rnk_n`` and ``howmany`` must be exactly the same variables that
+were used for the PFFT plan creation. However, only the case
+``rnk_n==3`` is completely implemented at the moment. The local array
+size ``local_n`` must be equal to ``local_ni`` or ``local_no`` (computed
+by an appropriate call of ``pfft_local_size_*``; cf.
+Section [sec:local-size]) depending on whether the ghost cell plan works
+on the FFT input or output array. Analogously, ``local_start`` becomes
+``local_i_start`` or ``local_o_start``. The number of ghost cells is
+given by the two arrays ``gc_below`` and ``gc_above`` that must be equal
+among all MPI processes. All the ghost cell data distribution functions
+return the local array plus ghost cell size ``local_ngc`` and the
+corresponding offset ``local_gc_start`` as two arrays of length
+``rnk_n``. In addition, the return value gives the number of data
+elements that are necessary in order to store the array plus ghost
+cells.
+
+Note, that the array distribution functions do not distinguish between
+real and complex valued data. That is because and count array elements
+in units of complex or real depending on the transform. In addition, it
+does not matter if the local array is transposed or not, i.e., it is not
+necessary to pass the flags and to the ghost cell distribution function.
+In contrast, the ghost cell plan creation depends on the transform type
+as well as the transposition flags.
+
+Memory Allocation
+~~~~~~~~~~~~~~~~~
+
+In most applications we must ensure that the data array is large enough
+to suit the memory requirements of a parallel FFT and the ghost cell
+communication. The following two code snippets illustrate the correct
+allocation of memory in for complex valued and real valued arrays.
+
+::
+
+    /* Get parameters of data distribution */
+    /* alloc_local, local_no, local_o_start are given in complex units */
+    /* local_ni, local_i_start are given in real units */
+    alloc_local = pfft_local_size_dft_r2c_3d(n, comm_cart_2d, PFFT_TRANSPOSED_NONE,
+        local_ni, local_i_start, local_no, local_o_start);
+
+    /* alloc_local_gc, local_ngc, local_gc_start are given in complex units */
+    alloc_local_gc = pfft_local_size_gc_3d(
+        local_no, local_o_start, gc_below, gc_above,
+        local_ngc, local_gc_start);
+
+    /* Allocate enough memory for FFT and ghost cells */
+    pfft_complex *cdata = pfft_alloc_complex(alloc_local_gc > alloc_local ? alloc_local_gc : alloc_local);
+
+Here, ``alloc_local`` gives the number of data elements that are
+necessary to hold all steps of the parallel FFT, while
+``alloc_local_gc`` gives the number of data elements that are necessary
+to hold all steps of the ghost cell communication. Note that we took the
+maximum of both of these numbers as argument for ``pfft_alloc_complex``.
+The code snippet for real valued arrays looks very similar.
+
+::
+
+    /* Get parameters of data distribution */
+    /* alloc_local, local_no, local_o_start are given in complex units */
+    /* local_ni, local_i_start are given in real units */
+    alloc_local = pfft_local_size_dft_r2c_3d(n, comm_cart_2d, PFFT_TRANSPOSED_NONE,
+        local_ni, local_i_start, local_no, local_o_start);
+
+    /* alloc_local_gc, local_ngc, local_gc_start are given in real units */
+    alloc_local_gc = pfft_local_size_gc_3d(
+        local_ni, local_i_start, gc_below, gc_above,
+        local_ngc, local_gc_start);
+
+    /* Allocate enough memory for FFT and ghost cells */
+    double *rdata = pfft_alloc_real(alloc_local_gc > 2*alloc_local ? alloc_local_gc : 2*alloc_local);
+
+Note that the number of real valued data elements is given by two times
+``alloc_local`` for r2c transforms, whereas the last line would change
+into
+
+::
+
+    double *rdata = pfft_alloc_real(alloc_local_gc > alloc_local ? alloc_local_gc : alloc_local);
+
+for r2r transforms.
+
+Plan Creation for Complex Data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following functions create ghost cell plans that operate on complex
+valued arrays, i.e.,
+
+c2c inputs,
+
+c2c outputs,
+
+r2c outputs (use flag ), and
+
+c2r inputs (use flag ).
+
+Corresponding to the three interface layers for FFT planning, there are
+the following three layers for creating a complex valued ghost cell
+plan:
+
+::
+
+    pfft_gcplan pfft_plan_cgc_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        pfft_complex *data, MPI_Comm comm_cart, unsigned gc_flags);
+    pfft_gcplan pfft_plan_cgc(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        pfft_complex *data, MPI_Comm comm_cart, unsigned gc_flags);
+    pfft_gcplan pfft_plan_many_cgc(
+        int rnk_n, const ptrdiff_t *n,
+        ptrdiff_t howmany, const ptrdiff_t *block,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        pfft_complex *data, MPI_Comm comm_cart, unsigned gc_flags);
+
+Hereby, , , and must be the variables that were used for the PFFT plan
+creation. However, only the case is completely implemented at the
+moment. Remember that is the logical FFT size just as it is the case for
+FFT planning. The block size must be equal to or depending on whether
+the ghost cell plan works on the FFT input or output array. Analogously,
+becomes or . Set the number of ghost cells by and as described in
+Section [sec:gc]. The flags must be set appropriately to the flags that
+were passed to the FFT planner. Table [tab:map-cgcflags] shows the ghost
+cell planner flags that must be set in dependence on the listed FFT
+planner flags.
+
+[h]
+
++------------+-------------------+
+| FFT flag   | ghost cell flag   |
++============+===================+
++------------+-------------------+
++------------+-------------------+
++------------+-------------------+
+
+[tab:map-cgcflags]
+
+In addition, we introduce the flag (and its equivalent ) to handle the
+complex array storage format of r2c and c2r transforms. In fact, these
+two flags imply an ordinary complex valued ghost cell communication on
+an array of size . Please note that we wrongly assume periodic boundary
+conditions in this case. Therefore, you should ignore the data elements
+with the last index behind .
+
+Plan Creation for Real Data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following functions create ghost cell plans that operate on real
+valued arrays, i.e.,
+
+r2r inputs,
+
+r2r outputs,
+
+r2c inputs, and
+
+c2r outputs.
+
+Corresponding to the three interface layers for FFT planning, there are
+the following three layers for creating a real valued ghost cell plan:
+
+::
+
+    pfft_gcplan pfft_plan_rgc_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        double *data, MPI_Comm comm_cart, unsigned gc_flags);
+    pfft_gcplan pfft_plan_rgc(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        double *data, MPI_Comm comm_cart, unsigned gc_flags);
+    pfft_gcplan pfft_plan_many_rgc(
+        int rnk_n, const ptrdiff_t *n,
+        ptrdiff_t howmany, const ptrdiff_t *block,
+        const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
+        double *data, MPI_Comm comm_cart, unsigned gc_flags);
+
+Hereby, , , and must be the variables that were used for the PFFT plan
+creation. Remember that is the logical FFT size just as it is the case
+for FFT planning. The block size must be equal to or depending on
+whether the ghost cell plan works on the FFT input or output array.
+Analogously, becomes or . Set the number of ghost cells by and as
+described in Section [sec:gc:local-size]. The flags must be set
+appropriately to the flags that were passed to the FFT planner.
+Table [tab:map-rgcflags] shows the ghost cell planner flags that must be
+set in dependence on the listed FFT planner flags.
+
+[h]
+
++------------+-------------------+
+| FFT flag   | ghost cell flag   |
++============+===================+
++------------+-------------------+
++------------+-------------------+
++------------+-------------------+
++------------+-------------------+
++------------+-------------------+
+
+[tab:map-rgcflags]
+
+Note that the flag (or its equivalent ) implies an ordinary real valued
+ghost cell communication on an array of size . Especially, the padding
+elements will be handled as normal data points, i.e., you must be aware
+that the numbers of ghost cells ``gc_below`` and ``gc_above`` include
+the number of padding elements.
+
+Unofficial Flags
+~~~~~~~~~~~~~~~~
+
+Ghost Cell Execution Timer
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+PFFT ghost cell plans automatically accumulate the local run times of
+every call to ``pfft_exchange`` and ``pfft_reduce``. For most
+applications it is sufficient to print the run time of a plan averaged
+over all runs with
+
+::
+
+    void pfft_print_average_gctimer(
+        const pfft_gcplan ths, MPI_Comm comm);
+
+Note that for each timer the maximum time over all processes is reduced
+to rank ``0`` of communicator ``comm``, i.e., a call to ``MPI_Reduce`` is
+performed and the output is only printed on this process. The following
+function works in the same way but prints more verbose output
+
+::
+
+    void pfft_print_average_gctimer_adv(
+        const pfft_gcplan ths, MPI_Comm comm);
+
+To write the averaged run time of a ghost cell plan ``ths`` into a file
+called ``name`` use
+
+::
+
+    void pfft_write_average_gctimer(
+        const pfft_gcplan ths, const char *name, MPI_Comm comm);
+    void pfft_write_average_gctimer_adv(
+        const pfft_gcplan ths, const char *name, MPI_Comm comm);
+
+Again, the output is only written on rank ``0`` of communicator ``comm``.
+
+Discard all the recorded run times with
+
+::
+
+    void pfft_reset_gctimers(
+        pfft_gcplan ths);
+
+This function is called per default at the end of every ghost cell plan
+creation function.
+
+In order to access the run times directly a new typedef
+``pfft_gctimer`` is introduced. The following functions return a copy of
+the timer corresponding to ghost cell plan ``ths`` that accumulated the
+time for ghost cell exchange or ghost cell reduce, respectively:
+
+::
+
+    pfft_gctimer pfft_get_gctimer_exg(
+        const pfft_gcplan ths);
+    pfft_gctimer pfft_get_gctimer_red(
+        const pfft_gcplan ths);
+
+Note that the memory of the returned ``pfft_gctimer`` must be released
+with
+
+::
+
+    void pfft_destroy_gctimer(
+        pfft_gctimer ths);
+
+as soon as the timer is not needed anymore.
+
+In the following we introduce some routines to perform basic operations
+on timers. For all functions with a ``pfft_gctimer`` return value you
+must use ``pfft_destroy_gctimer`` in order to release the allocated
+memory of the timer. Create a copy of a ghost cell timer with
+
+::
+
+    pfft_gctimer pfft_copy_gctimer(
+        const pfft_gctimer orig);
+
+Compute the average, local time over all runs of a timer with
+
+::
+
+    void pfft_average_gctimer(
+        pfft_gctimer ths);
+
+Create a new timer that contains the sum of two timers ``sum1`` and
+``sum2`` with
+
+::
+
+    pfft_gctimer pfft_add_gctimers(
+        const pfft_gctimer sum1, const pfft_gctimer sum2);
+
+Create a timer that contains the maximum times of all the timers from
+all processes belonging to communicator ``comm`` with
+
+::
+
+    pfft_gctimer pfft_reduce_max_gctimer(
+        const pfft_gctimer ths, MPI_Comm comm);
+
+Since this function calls ``MPI_Reduce``, only the first process (rank 0)
+of ``comm`` will get the desired data while all the other processes have
+timers with undefined values.
+
+Note, that you can not access the elements of a timer directly, since it
+is only a pointer to a . However, PFFT offers a routine that creates an
+array and copies all the entries of the timer into it
+
+::
+
+    void pfft_convert_gctimer2vec(
+        const pfft_gctimer ths, double *times);
+
+Remember to use in order to release the allocated memory of the returned
+array at the moment it is not needed anymore. The entries of the
+returned array are ordered as follows:
+
+number of runs
+
+local run time of all runs
+
+local run time of zero padding (make room for incoming ghost cells and
+init with zeros)
+
+local run time of the ghost cell exchange or reduce (depending on the
+timer)
+
+The complementary function
+
+::
+
+    pfft_gctimer pfft_convert_vec2gctimer(
+        const double *times);
+
+creates a timer and fills its entries with the data from array
+``times``. Thereby, the entries of ``times`` must be in the same order as
+above.
+
+Useful Tools
+------------
+
+The following functions are useful tools but are not necessarily needed
+to perform parallel FFTs.
+
+Initializing Complex Inputs and Checking Outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To fill a complex array with reproducible, complex values you can use
+one of the functions
+
+::
+
+    void pfft_init_input_complex_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        pfft_complex *data);
+    void pfft_init_input_complex(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        pfft_complex *data);
+
+Hereby, the arrays ``n``, ``local_n`` and ``local_start`` of length
+``rnk_n`` (``3`` for the ``_3d`` variant) give the size of the FFT, the
+local array size and the local array offset as computed by the array
+distribution functions described in Section [sec:local-size]. The
+functions
+
+::
+
+    double pfft_check_output_complex_3d(
+        const ptrdiff_t *n, 
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        const pfft_complex *data, MPI_Comm comm);
+    double pfft_check_output_complex(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const pfft_complex *data, MPI_Comm comm);
+
+compute the :math:`l_1`-norm between the elements of array ``data`` and
+the values produced by ``pfft_init_input_complex_3d``,
+``pfft_init_input_complex``. In addition, we supply the following
+functions for setting all the input data to zero at once
+
+::
+
+    void pfft_clear_input_complex_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        pfft_complex *data);
+    void pfft_clear_input_complex(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        pfft_complex *data);
+
+Note, that these functions can be combined for a quick consistency check
+of the FFT. Since a forward FFT followed by a backward FFT reproduces
+the inputs up to a scaling factor, the following code snippet should
+give a result equal to zero up to machine precision.
+
+::
+
+    /* Initialize input with random numbers */
+    pfft_init_input_complex_3d(n, local_ni, local_i_start,
+        in);
+
+    /* execute parallel forward FFT */
+    pfft_execute(plan_forw);
+
+    /* clear the old input */
+    if(in != out) 
+      pfft_clear_input_complex_3d(n, local_ni, local_i_start, in);
+
+    /* execute parallel backward FFT */
+    pfft_execute(plan_back);
+
+    /* Scale data */
+    for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1] * local_ni[2]; l++)
+      in[l] /= (n[0]*n[1]*n[2]);
+
+    /* Print error of back transformed data */
+    err = pfft_check_output_complex_3d(n, local_ni, local_i_start, in, comm_cart_2d);
+    pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td, %td):\n", n[0], n[1], n[2]);
+    pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err);
+
+Hereby, we set all inputs equal to zero after the forward FFT in order
+to be sure that all the final results are actually computed by the
+backward FFT instead of being a buggy relic of the forward transform.
+
+Initializing Real Inputs and Checking Outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To fill a real array with reproducible, real values use one of the
+functions
+
+::
+
+    void pfft_init_input_real_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        double *data);
+    void pfft_init_input_real(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        double *data);
+
+Hereby, the arrays ``n``, ``local_n`` and ``local_start`` give the size
+of the FFT, the local array size and the local array offset as computed
+by the array distribution functions described in
+Section [sec:local-size]. The functions
+
+::
+
+    double pfft_check_output_real_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        const double *data, MPI_Comm comm);
+    double pfft_check_output_real(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const double *data, MPI_Comm comm);
+
+compute the :math:`l_1`-norm between the elements of array ``data`` and
+the values produced by ``pfft_init_input_real_3d``,
+``pfft_init_input_real``. In addition, we supply the following functions
+for setting all the input data to zero at once
+
+::
+
+    void pfft_clear_input_real_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        double *data);
+    void pfft_clear_input_real(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        double *data);
+
+Note that both functions will set all array elements to zero where
+``local_start[t] + k[t] >= n[t]``, i.e., the padding elements. In
+addition, both functions will ignore all the errors resulting from these
+elements. Therefore, it is safe to use all these functions for a
+consistency check of a r2c transform followed by a c2r transform since
+all padding elements will be ignored.
+
+Initializing r2c/c2r Inputs and Checking Outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The real inputs of a r2c transform can be initialized with the functions
+described in Section [sec:init-data-3d-r2r]. However, generating suitable
+inputs for a c2r transform requires more caution. In order to get real
+valued results of a DFT the complex input coefficients need to satisfy
+a radial Hermitian symmetry, i.e.,
+:math:`X[\mathbf k] = {X^*[-\mathbf k]}`. We use the following trick to
+generate the complex input values for c2r transforms. Assume any
+:math:`\mathbf N`-periodic complex valued function :math:`f`. It can be
+easily shown that the values
+:math:`X[\mathbf k] := \frac{1}{2}\left(f(\mathbf k)+f^*(-\mathbf k)\right)`
+satisfy the radial Hermitian symmetry.
+
+To fill a complex array with reproducible, complex values that fulfill
+the radial Hermitian symmetry use one of the functions
+
+::
+
+    void pfft_init_input_complex_hermitian_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        double *data);
+    void pfft_init_input_complex_hermitian(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        double *data);
+
+Hereby, the arrays ``n``, ``local_n`` and ``local_start`` (``local_n_start`` in the 3d variant)
+give the size of the FFT, the local array size and the local array offset as computed by
+the array distribution functions described in Section [sec:local-size]. The functions
+
+::
+
+    double pfft_check_output_complex_hermitian_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        const pfft_complex *data, MPI_Comm comm);
+    double pfft_check_output_complex_hermitian(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const pfft_complex *data, MPI_Comm comm);
+
+compute the :math:`l_1`-norm between the elements of array ``data`` and the values
+produced by ``pfft_init_input_complex_hermitian_3d``, ``pfft_init_input_complex_hermitian``. In addition, we supply the
+following functions for setting all the input data to zero at once
+
+::
+
+    void pfft_clear_input_complex_hermitian_3d(
+        const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_n_start,
+        pfft_complex *data);
+    void pfft_clear_input_complex_hermitian(
+        int rnk_n, const ptrdiff_t *n,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        pfft_complex *data);
+
+Note, that these functions can also be used in order to generate complex
+inputs with radial Hermitian symmetry for ordinary c2c transforms. Of
+course the results of such a c2c DFT will have all imaginary parts equal
+to zero up to machine precision.
+
+Operations on Arrays of Type 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following routines are shortcuts for the elementwise manipulation of
+``ptrdiff_t`` valued arrays. In the following, all arrays ``vec``, ``vec1``, and ``vec2`` are of length ``d`` and
+type ``ptrdiff_t``.
+
+::
+
+    ptrdiff_t pfft_prod_INT(
+        int d, const ptrdiff_t *vec);
+
+Returns the product over all elements of ``vec``.
+
+::
+
+    ptrdiff_t pfft_sum_INT(
+        int d, const ptrdiff_t *vec);
+
+Returns the sum over all elements of ``vec``.
+
+::
+
+    int pfft_equal_INT(
+        int d, const ptrdiff_t *vec1, const ptrdiff_t *vec2);
+
+Returns 1 if both arrays have equal entries, 0 otherwise.
+
+::
+
+    void pfft_vcopy_INT(
+        int d, const ptrdiff_t *vec1,
+        ptrdiff_t *vec2);
+
+Copies the elements of ``vec1`` into ``vec2``.
+
+::
+
+    void pfft_vadd_INT(
+        int d, const ptrdiff_t *vec1, const ptrdiff_t *vec2,
+        ptrdiff_t *sum);
+
+Fills ``sum`` with the componentwise sum of ``vec1`` and ``vec2``.
+
+::
+
+    void pfft_vsub_INT(
+        int d, const ptrdiff_t *vec1, const ptrdiff_t *vec2,
+        ptrdiff_t *sum);
+
+Fills ``sum`` with the componentwise difference of ``vec1`` and ``vec2``.
+
+Print Three-Dimensional Arrays in Parallel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use the following routine to print the elements of a block decomposed
+three-dimensional (real or complex valued) array in a nicely formatted
+way.
+
+::
+
+    void pfft_apr_real_3d(
+        const double *data,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const char *name, MPI_Comm comm);
+    void pfft_apr_complex_3d(
+        const pfft_complex *data,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        const char *name, MPI_Comm comm);
+
+Obviously, this only makes sense for arrays of moderate size. The block
+decomposition is given by ``local_n``, ``local_start`` as returned by the array distribution
+function described in Section [sec:local-size]. Furthermore, some
+arbitrary string ``name`` can be added at the beginning of each output -
+typically this will be the name of the array. Communicator ``comm`` must be
+suitable to the block decomposition and is used to synchronize the
+outputs over all processes.
+
+Generalizations for the case where the dimensions of the local arrays
+are permuted are given by
+
+::
+
+    void pfft_apr_real_permuted_3d(
+        const double *data,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        int perm0, int perm1, int perm2,
+        const char *name, MPI_Comm comm);
+    void pfft_apr_complex_permuted_3d(
+        const pfft_complex *data,
+        const ptrdiff_t *local_n, const ptrdiff_t *local_start,
+        int perm0, int perm1, int perm2,
+        const char *name, MPI_Comm comm);
+
+Hereby, ``perm0``, ``perm1``, and ``perm2`` give the array’s permutation of dimension.
+
+Reading Command Line Arguments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following function offers a simple way to read command line
+arguments into an array ``parameter``.
+
+::
+
+    void pfft_get_args(
+        int argc, char **argv, const char *name,
+        int neededArgs, unsigned type,
+        void *parameter);
+
+Hereby, ``argc`` and ``argv`` are the standard arguments of the ``main`` routine. Furthermore, ``name``, ``neededArgs``,
+and ``type`` give the name, number of entries and the type of the command line
+argument. Supported types include ``PFFT_INT``, ``PFFT_PTRDIFF_T``, and ``PFFT_DOUBLE``, which denote the standard C
+type that is used for typecasting. In addition, you can use the special
+type ``PFFT_SWITCH`` that is an integer type equal to one if the corresponding command
+line argument is given. The array ``parameter`` must be of sufficient size to hold ``neededArgs``
+elements of the given data type. Special attention is given
+
+For example, a program containing the following code snippet
+
+::
+
+    double x=0.1;
+    pfft_get_args(argc, argv, "-pfft_x", 1, PFFT_DOUBLE, &x);
+    int np[2]={2,1};
+    pfft_get_args(argc, argv, "-pfft_np", 2, PFFT_INT, np);
+    ptrdiff_t n[3]={32,32,32};
+    pfft_get_args(argc, argv, "-pfft_n", 3, PFFT_PTRDIFF_T, n);
+    int switch=0;
+    pfft_get_args(argc, argv, "-pfft_on", 0, PFFT_SWITCH, switch);
+
+that is executed via
+
+::
+
+    ./test -pfft_x 3.1 -pfft_np 2 3 -pfft_n 8 16 32 -pfft_on
+
+will read ``x=3.1``, ``np={2,3}``, ``n={8,16,32}``, and turn on the ``switch``. Note the address operator ``&`` in front of
+``x`` in the second line! Furthermore, note that the initialization of all
+variables with default values before the call of ``pfft_get_args`` avoids trouble if the
+user does not provide all the command line arguments.
+
+Parallel Substitutes for , , and 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following functions are similar to the standard C functions ``vfprintf``, ``fprintf`` and
+``printf`` with the exception that only rank 0 within the given communicator ``comm`` will
+produce output. The intention is to avoid the flood of messages that is
+produced when simple ``printf`` statements are run in parallel.
+
+::
+
+    void pfft_vfprintf(
+        MPI_Comm comm, FILE *stream, const char *format, va_list ap);
+    void pfft_fprintf(
+        MPI_Comm comm, FILE *stream, const char *format, ...);
+    void pfft_printf(
+        MPI_Comm comm, const char *format, ...);
+
+Generating Periodic Cartesian Communicators
+-------------------------------------------
+
+Based on the processes that are part of the given communicator the
+following routine
+
+::
+
+    int pfft_create_procmesh_1d(
+        MPI_Comm comm, int np0,
+        MPI_Comm *comm_cart_1d);
+
+allocates and creates a one-dimensional, periodic, Cartesian
+communicator of size ``np0``. Thereby, a non-zero error code is returned
+whenever ``np0`` does not fit the size of ``comm``. The memory of the generated
+communicator should be released with ``MPI_Comm_free`` after usage. Analogously, use
+
+::
+
+    int pfft_create_procmesh_2d(
+        MPI_Comm comm, int np0, int np1,
+        MPI_Comm *comm_cart_2d);
+
+in order to allocate and create a two-dimensional, periodic, Cartesian
+communicator of size ``np0`` x ``np1`` or
+
+::
+
+    int pfft_create_procmesh(
+        int rnk_np, MPI_Comm comm, const int *np,
+        MPI_Comm *comm_cart);
+
+in order to allocate and create a ``rnk_np``-dimensional, periodic, Cartesian
+communicator of size ``np[0]`` x ... x ``np[rnk_np-1]``. Hereby, ``np`` is an array of length ``rnk_np``. Again, the memory
+of the generated communicator should be released with ``MPI_Comm_free`` after usage.
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 28fcf9f..dd1a641 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1,437 +1,535 @@
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\chapter{Tutorial}\label{chap:tuto}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-The following chapter describes the usage of the PFFT library at the example of a simple test file in the first section,
-followed by the more advanced features of PFFT in the next sections.
-
-\section{A first parallel transform - Three-dimensional FFT with two-dimensional data decomposition}
-We explain the basic steps for computing a parallel FFT with the PFFT library at the example
-of the short test program given by Listing~\ref{lst:man_c2c}. This test computes a three-dimensional c2c-FFT on
-a two-dimensional process mesh. The source code \code{manual_c2c_3d.c} can be found in directory \code{tests/}
-of the library's source code tree. 
-\lstinputlisting[numbers=left, float, caption={Minimal parallel c2c-FFT test program.}, label=lst:man_c2c]{../tests/manual_c2c_3d.c}
-
-After initializing MPI with \code{MPI_Init} and before calling any other PFFT routine initialize
-the parallel FFT computations via
-\begin{lstlisting}
-void pfft_init(void);
-\end{lstlisting}
-MPI introduces the concept of communicators to store all the topological information of the physical process layout.
-PFFT requires to be called on a process mesh that corresponds to a periodic, Cartesian communicator.
-We assist the user in creating such a communicator with the following routine
-\begin{lstlisting}
-int pfft_create_procmesh_2d(
-    MPI_Comm comm, int np0, int np1,
-    MPI_Comm *comm_cart_2d);
-\end{lstlisting}
-This routine uses the processes within the communicator \code{comm} to create a two-dimensional process
-grid of size \code{np0} x \code{np1} and stores it into the Cartesian communicator \code{comm_cart_2d}.
-Note that \code{comm_cart_2d} is allocated by the routine and must be freed with \code{MPI_Comm_free} after usage.
-The input parameter \code{comm} is a communicator, indicating which processes will participate in the transform.
-Choosing \code{comm} as \code{MPI_COMM_WORLD} implies that the FFT is computed on all available processes.
-
-At the next step we need to know the data decomposition of the input and output array, that depends on
-the array sizes, the process grid and the chosen parallel algorithm. Therefore, we call
-\begin{lstlisting}
-ptrdiff_t pfft_local_size_3d(
-    ptrdiff_t *n, MPI_Comm comm_cart_2d, unsigned pfft_flags,
-    ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
-    ptrdiff_t *local_no, ptrdiff_t *local_o_start);
-\end{lstlisting}
-Hereby, \code{n}, \code{local_ni}, \code{local_i_start}, \code{local_no}, \code{local_o_start} are arrays of length $3$ that must be allocated.
-The return value of this function equals the size of the local complex array that needs to be allocated by every process.
-In most cases, this coincides with the product of the local array sizes -- but may be bigger,
-whenever the parallel algorithm needs some extra storage.
-The input value \code{n} gives the three-dimensional FFT size and the flag \code{pfft_flags} serves to adjust
-some details of the parallel execution. For the sake of simplicity, we restrict our self to the case
-\code{pfft_flags=PFFT_TRANSPOSED_NONE} for a while and explain the more sophisticated flags at a later point.
-The output arrays \code{local_ni} and \code{local_i_start} give the size and the offset of the local input array
-that result from the parallel block distribution of the global input array, i.e.,
-every process owns the input data \code{in[k[0],k[1],k[2]]} with \code{local_i_start[t] <= k[t] < local_i_start[t] + local_ni[t]}
-for \code{t=0,1,2}. Analogously, the output parameters \code{local_o_start} and \code{local_no} contain the size
-and the offset of the local output array.
+Tutorial
+========
+
+The following chapter describes the usage of the PFFT library at the
+example of a simple test file in the first section, followed by the more
+advanced features of PFFT in the next sections.
+
+A first parallel transform - Three-dimensional FFT with two-dimensional data decomposition
+------------------------------------------------------------------------------------------
+
+We explain the basic steps for computing a parallel FFT with the PFFT
+library at the example of the short test program given by
+Listing [lst:man_c2c]. This test computes a three-dimensional
+c2c-FFT on a two-dimensional process mesh. The source code ``manual_c2c_3d.c`` can be found
+in directory ``tests/`` of the library’s source code tree.
+
+After initializing MPI with ``MPI_Init`` and before calling any other PFFT routine
+initialize the parallel FFT computations via
+
+::
+
+    void pfft_init(void);
+
+MPI introduces the concept of communicators to store all the topological
+information of the physical process layout. PFFT requires to be called
+on a process mesh that corresponds to a periodic, Cartesian
+communicator. We assist the user in creating such a communicator with
+the following routine
+
+::
+
+    int pfft_create_procmesh_2d(
+        MPI_Comm comm, int np0, int np1,
+        MPI_Comm *comm_cart_2d);
+
+This routine uses the processes within the communicator ``comm`` to create a
+two-dimensional process grid of size ``np0`` x ``np1`` and stores it into the Cartesian
+communicator ``comm_cart_2d``. Note that ``comm_cart_2d`` is allocated by the routine and must be freed
+with ``MPI_Comm_free`` after usage. The input parameter ``comm`` is a communicator, indicating
+which processes will participate in the transform. Choosing ``comm`` as ``MPI_COMM_WORLD`` implies
+that the FFT is computed on all available processes.
+
+At the next step we need to know the data decomposition of the input and
+output array, that depends on the array sizes, the process grid and the
+chosen parallel algorithm. Therefore, we call
+
+::
+
+    ptrdiff_t pfft_local_size_3d(
+        ptrdiff_t *n, MPI_Comm comm_cart_2d, unsigned pfft_flags,
+        ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+Hereby, ``n``, ``local_ni``, ``local_i_start``, ``local_no``, ``local_o_start`` are arrays of length :math:`3` that must be allocated.
+The return value of this function equals the size of the local complex
+array that needs to be allocated by every process. In most cases, this
+coincides with the product of the local array sizes – but may be bigger,
+whenever the parallel algorithm needs some extra storage. The input
+value ``n`` gives the three-dimensional FFT size and the flag ``pfft_flags`` serves to adjust
+some details of the parallel execution. For the sake of simplicity, we
+restrict ourselves to the case ``pfft_flags=PFFT_TRANSPOSED_NONE`` for a while and explain the more
+sophisticated flags at a later point. The output arrays ``local_ni`` and ``local_i_start`` give the
+size and the offset of the local input array that result from the
+parallel block distribution of the global input array, i.e., every
+process owns the input data ``in[k[0],k[1],k[2]]`` with ``local_i_start[t] <= k[t] < local_i_start[t] + local_ni[t]`` for ``t=0,1,2``. Analogously, the output
+parameters ``local_no`` and ``local_o_start`` contain the size and the offset of the local output
+array.
 
 Afterward, the input and output arrays must be allocated. Hereby,
-\begin{lstlisting}
-pfft_complex* pfft_alloc_complex(size_t size);
-\end{lstlisting}
-is a simple wrapper of \code{fftw_alloc_complex}, which in turn allocates the memory via \code{fftw_malloc} to ensure proper alignment for SIMD.
-Have a look at the FFTW user manual~\cite{fftw-align-mem} for more details on SIMD memory alignment and \code{fftw_malloc}.
-Nevertheless, you can also use any other dynamic memory allocation.
-
-The planning of a single three-dimensional parallel FFT of size \code{n[0]} x \code{n[1]} x \code{n[2]}
-is done by the function
-\begin{lstlisting}
-pfft_plan pfft_plan_dft_3d(
-    ptrdiff_t *n, pfft_complex *in, pfft_complex *out,
-    MPI_Comm comm_cart_2d, int sign, unsigned pfft_flags);
-\end{lstlisting}
-We provide the address of the input and output array by the pointers \code{in} and \code{out},
-respectively. An inplace transform is assumed if these pointers are equal.
-The integer \code{sign} gives the sign in the exponential of the FFT. Possible values are \code{PFFT_FORWARD} ($-1$)
-and \code{PFFT_BACKWARD} ($+1$).
-Flags passed to the planner via \code{pfft\_flags} must coincide with the flags that were passed to \code{pfft_local_size_3d}.
-Otherwise the data layout of the parallel execution may not match calculated local array sizes.
-As return value we get a PFFT plan, some structure that stores all the information needed to perform a parallel FFT.
-
-Once the plan is generated, we are allowed to fill the input array \code{in}. Note, that per default the planning step
-\code{pfft_plan_dft_3d} will overwrite input array \code{in}. Therefore, you should not write any sensitive data into \code{in} until the plan was generated.
-For simplicity, our test program makes use of the library function
-\begin{lstlisting}
-void pfft_init_input_complex_3d(
-    ptrdiff_t *n, ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
-    pfft_complex *in);
-\end{lstlisting}
-to fill the input array with some numbers. Alternatively, one can fill the array with a function \code{func} of choice
-and the following loop that takes account of the parallel data layout
-\begin{lstlisting}
-ptrdiff_t m=0;
-for(ptrdiff_t k0=0; k0 < local_ni[0]; k0++)
-  for(ptrdiff_t k1=0; k1 < local_ni[1]; k1++)
-    for(ptrdiff_t k2=0; k2 < local_ni[2]; k2++)
-      in[m++] = func(k0 + local_i_start[0],
-                     k1 + local_i_start[1],
-                     k2 + local_i_start[2]);
-\end{lstlisting}
+
+::
+
+    pfft_complex* pfft_alloc_complex(size_t size);
+
+is a simple wrapper of ``fftw_alloc_complex``, which in turn allocates the memory via ``fftw_malloc`` to
+ensure proper alignment for SIMD. Have a look at the FFTW user manual
+for more details on SIMD memory alignment and ``fftw_malloc``. Nevertheless, you can
+also use any other dynamic memory allocation.
+
+The planning of a single three-dimensional parallel FFT of size ``n[0]`` x ``n[1]`` x ``n[2]`` is
+done by the function
+
+::
+
+    pfft_plan pfft_plan_dft_3d(
+        ptrdiff_t *n, pfft_complex *in, pfft_complex *out,
+        MPI_Comm comm_cart_2d, int sign, unsigned pfft_flags);
+
+We provide the address of the input and output array by the pointers ``in`` and
+``out``, respectively. An inplace transform is assumed if these pointers are
+equal. The integer ``sign`` gives the sign in the exponential of the FFT.
+Possible values are ``PFFT_FORWARD`` (:math:`-1`) and ``PFFT_BACKWARD`` (:math:`+1`). Flags passed to the
+planner via ``pfft_flags`` must coincide with the flags that were passed to ``pfft_local_size_3d``. Otherwise
+the data layout of the parallel execution may not match calculated local
+array sizes. As return value we get a PFFT plan, some structure that
+stores all the information needed to perform a parallel FFT.
+
+Once the plan is generated, we are allowed to fill the input array ``in``.
+Note, that per default the planning step ``pfft_plan_dft_3d`` will overwrite input array ``in``.
+Therefore, you should not write any sensitive data into ``in`` until the plan
+was generated. For simplicity, our test program makes use of the library
+function
+
+::
+
+    void pfft_init_input_complex_3d(
+        ptrdiff_t *n, ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+        pfft_complex *in);
+
+to fill the input array with some numbers. Alternatively, one can fill
+the array with a function ``func`` of choice and the following loop that takes
+account of the parallel data layout
+
+::
+
+    ptrdiff_t m=0;
+    for(ptrdiff_t k0=0; k0 < local_ni[0]; k0++)
+      for(ptrdiff_t k1=0; k1 < local_ni[1]; k1++)
+        for(ptrdiff_t k2=0; k2 < local_ni[2]; k2++)
+          in[m++] = func(k0 + local_i_start[0],
+                         k1 + local_i_start[1],
+                         k2 + local_i_start[2]);
+
 The parallel FFT is computed when we execute the generated plan via
-\begin{lstlisting}
-void pfft_execute(const pfft_plan plan);
-\end{lstlisting}
-Now, the results can be read from \code{out} with an analogous three-dimensional loop.
-If we do not want to execute another parallel FFT of the same type, we free the allocated memory of the plan with
-\begin{lstlisting}
-void pfft_destroy_plan(pfft_plan plan);
-\end{lstlisting}
+
+::
+
+    void pfft_execute(const pfft_plan plan);
+
+Now, the results can be read from ``out`` with an analogous three-dimensional
+loop. If we do not want to execute another parallel FFT of the same
+type, we free the allocated memory of the plan with
+
+::
+
+    void pfft_destroy_plan(pfft_plan plan);
+
 Additionally, we use
-\begin{lstlisting}
-int MPI_Comm_free(MPI_Comm *comm);  
-\end{lstlisting}
-to free the communicator allocated by \code{pfft_create_procmesh_2d} and
-\begin{lstlisting}
-void pfft_free(void *ptr);
-\end{lstlisting}
-to free memory allocated by \code{pfft_alloc_complex}.
-Finally, we exit MPI via
-\begin{lstlisting}
-int MPI_Finalize(void);
-\end{lstlisting}
-
-
-\section{Porting FFTW-MPI based code to PFFT}\label{sec:porting}
-\todo[inline]{finish FFTW2PFFT porting example}
-We illustrate the close connection between FFTW-MPI and PFFT at a three-dimensional MPI example analogous to the example given in the FFTW manual~\cite{fftw-2dmpi}.
-\lstinputlisting[numbers=left, float, caption={Minimal parallel c2c-FFT test program.}, label=lst:fftw_3don1d]{man_fftw_3don1d.tex}
-
-Exactly the same task can be performed with PFFT as given in Listing~\ref{lst:pfft_3don1d}.
-\begin{lstlisting}
-#include <pfft.h>
-     
-int main(int argc, char **argv)
-{
-    const ptrdiff_t n[3] = {..., ..., ...};
-    pfft_plan plan;
-    pfft_complex *data;
-    ptrdiff_t alloc_local, local_ni[3], local_i_start[3], local_no[3], local_o_start[3], i, j, k;
-    unsigned pfft_flags = 0;
-
-    MPI_Init(&argc, &argv);
-    pfft_init();
-
-    /* get local data size and allocate */
-    alloc_local = pfft_local_size_dft_3d(n, MPI_COMM_WORLD, pfft_flags,
-				         local_ni, local_i_start,
-				         local_no, local_o_start);
-    data = pfft_alloc_complex(alloc_local);
-
-    /* create plan for in-place forward DFT */
-    plan = pfft_plan_dft_3d(n, data, data, MPI_COMM_WORLD,
-			    PFFT_FORWARD, PFFT_ESTIMATE);
-
-    /* initialize data to some function my_function(x,y,z) */
-    for (i = 0; i < local_n[0]; ++i) 
-      for (j = 0; j < n[1]; ++j) 
-        for (k = 0; k < n[2]; ++k)
-          data[i*n[1]*n[2] + j*n[2] + k] = my_function(local_i_start[0] + i, j, k);
-
-    /* compute transforms, in-place, as many times as desired */
-    pfft_execute(plan);
-
-    pfft_destroy_plan(plan);
-
-    MPI_Finalize();
-}
-\end{lstlisting}
-
-
-
-\begin{compactitem}
-  \item substitute \code{fftw3-mpi.h} by \code{pfft.h}
-  \item substitute all prefixes \code{fftw_} and \code{fftw_mpi_} by \code{pfft_}
-  \item substitute all prefixes \code{FFTW_} by \code{PFFT_}
-  \item the integers \code{N}, \code{local_n0}, \code{local_0_start} become arrays of length 3
-  \item \code{dft_} in \code{pfft_local_size_dft_3d}
-  \item \code{pfft_local_size_dft_3d} has additional input \code{pfft_flags} and additional outputs \code{local_no}, \code{local_o_start}
-  \item The loop that inits \code{data} becomes splitted along all three dimensions. We could also use 
-  
-  
-\end{compactitem}
-
-
-First, All prefixes \code{fftw_} are substituted by \code{pfft_}
-
-Now, the changes in order to use a two-dimensional process mesh are marginal as can be seen in Listing~\ref{lst:pfft_3don2d}.
-\begin{lstlisting}
-#include <pfft.h>
-     
-int main(int argc, char **argv)
-{
-    const ptrdiff_t n[3] = {..., ..., ...};
-    (red@const int np0 = ..., np1 = ...;@*)
-    pfft_plan plan;
-    pfft_complex *data;
-    ptrdiff_t alloc_local, local_ni[3], local_i_start[3], local_no[3], local_o_start[3], i, j, k;
-    unsigned pfft_flags = 0;
-    (red@MPI_Comm comm_cart_2d;@*)
-
-    MPI_Init(&argc, &argv);
-    pfft_init();
-
-    (red@/* create two-dimensional process grid of size np0 x np1 */@*)
-    (red@pfft_create_procmesh_2d(MPI_COMM_WORLD, np0, np1,@*)
-        (red@&comm_cart_2d);@*)
-    
-    /* get local data size and allocate */
-    alloc_local = pfft_local_size_dft_3d(n, (red@comm_cart_2d@*), pfft_flags,
-				         local_ni, local_i_start,
-				         local_no, local_o_start);
-    data = pfft_alloc_complex(alloc_local);
-
-    /* create plan for in-place forward DFT */
-    plan = pfft_plan_dft_3d(n, data, data, MPI_COMM_WORLD,
-			    PFFT_FORWARD, PFFT_ESTIMATE);
-
-    /* initialize data to some function my_function(x,y,z) */
-    for (i = 0; i < local_n[0]; ++i) 
-      for (j = 0; j < (red@local_n[1]@*); ++j) 
-        for (k = 0; k < (red@local_n[2]@*); ++k)
-          data[i*(red@local_n[1]*local_n[2]@*) + j*(red@local_n[2]@*) + k] =
-              my_function(local_i_start[0] + i,
-		          (red@local_i_start[1] +@*) j,
-		          (red@local_i_start[2] +@*) k);
-
-    /* compute transforms, in-place, as many times as desired */
-    pfft_execute(plan);
 
-    pfft_destroy_plan(plan);
-
-    MPI_Finalize();
-}
-\end{lstlisting}
+::
+
+    int MPI_Comm_free(MPI_Comm *comm);  
+
+to free the communicator allocated by ``pfft_create_procmesh_2d`` and
+
+::
+
+    void pfft_free(void *ptr);
+
+to free memory allocated by ``pfft_alloc_complex``. Finally, we exit MPI via
+
+::
+
+    int MPI_Finalize(void);
+
+Porting FFTW-MPI based code to PFFT
+-----------------------------------
+
+We illustrate the close connection between FFTW-MPI and PFFT at a
+three-dimensional MPI example analogous to the example given in the FFTW
+manual .
+
+Exactly the same task can be performed with PFFT as given in
+Listing [lst:pfft_3don1d].
+
+::
+
+    #include <pfft.h>
+         
+    int main(int argc, char **argv)
+    {
+        const ptrdiff_t n[3] = {..., ..., ...};
+        pfft_plan plan;
+        pfft_complex *data;
+        ptrdiff_t alloc_local, local_ni[3], local_i_start[3], local_no[3], local_o_start[3], i, j, k;
+        unsigned pfft_flags = 0;
+
+        MPI_Init(&argc, &argv);
+        pfft_init();
+
+        /* get local data size and allocate */
+        alloc_local = pfft_local_size_dft_3d(n, MPI_COMM_WORLD, pfft_flags,
+                             local_ni, local_i_start,
+                             local_no, local_o_start);
+        data = pfft_alloc_complex(alloc_local);
+
+        /* create plan for in-place forward DFT */
+        plan = pfft_plan_dft_3d(n, data, data, MPI_COMM_WORLD,
+                    PFFT_FORWARD, PFFT_ESTIMATE);
+
+        /* initialize data to some function my_function(x,y,z) */
+        for (i = 0; i < local_n[0]; ++i) 
+          for (j = 0; j < n[1]; ++j) 
+            for (k = 0; k < n[2]; ++k)
+              data[i*n[1]*n[2] + j*n[2] + k] = my_function(local_i_start[0] + i, j, k);
+
+        /* compute transforms, in-place, as many times as desired */
+        pfft_execute(plan);
+
+        pfft_destroy_plan(plan);
+
+        MPI_Finalize();
+    }
+
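+- substitute ``fftw3-mpi.h`` by ``pfft.h``
+
+- substitute all prefixes ``fftw_`` and ``fftw_mpi_`` by ``pfft_``
+
+- substitute all prefixes ``FFTW_`` by ``PFFT_``
+
+- the integers ``N``, ``local_n0``, ``local_0_start`` become arrays of length 3
+
+- ``dft_`` in ``pfft_local_size_dft_3d``
+
+- ``pfft_local_size_dft_3d`` has additional input ``pfft_flags`` and additional outputs ``local_no``, ``local_o_start``
+
+- The loop that inits ``data`` is split along all three dimensions. We
+  could also use
+
+First, all prefixes ``fftw_`` are substituted by ``pfft_``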
+
+Now, the changes in order to use a two-dimensional process mesh are
+marginal as can be seen in Listing [lst:pfft_3don2d].
 
+::
 
+    #include <pfft.h>
+         
+    int main(int argc, char **argv)
+    {
+        const ptrdiff_t n[3] = {..., ..., ...};
+        const int np0 = ..., np1 = ...;
+        pfft_plan plan;
+        pfft_complex *data;
+        ptrdiff_t alloc_local, local_ni[3], local_i_start[3], local_no[3], local_o_start[3], i, j, k;
+        unsigned pfft_flags = 0;
+        MPI_Comm comm_cart_2d;
 
+        MPI_Init(&argc, &argv);
+        pfft_init();
 
+        /* create two-dimensional process grid of size np0 x np1 */
+        pfft_create_procmesh_2d(MPI_COMM_WORLD, np0, np1,
+            &comm_cart_2d);
+        
+        /* get local data size and allocate */
+        alloc_local = pfft_local_size_dft_3d(n, comm_cart_2d, pfft_flags,
+                             local_ni, local_i_start,
+                             local_no, local_o_start);
+        data = pfft_alloc_complex(alloc_local);
 
+        /* create plan for in-place forward DFT */
+        plan = pfft_plan_dft_3d(n, data, data, MPI_COMM_WORLD,
+                    PFFT_FORWARD, PFFT_ESTIMATE);
 
+        /* initialize data to some function my_function(x,y,z) */
+        for (i = 0; i < local_n[0]; ++i) 
+          for (j = 0; j < local_n[1]; ++j) 
+            for (k = 0; k < local_n[2]; ++k)
+              data[i*local_n[1]*local_n[2] + j*local_n[2] + k] =
+                  my_function(local_i_start[0] + i,
+                      local_i_start[1] + j,
+                      local_i_start[2] + k);
+
+        /* compute transforms, in-place, as many times as desired */
+        pfft_execute(plan);
+
+        pfft_destroy_plan(plan);
+
+        MPI_Finalize();
+    }
+
+Errorcode for communicator creation
+-----------------------------------
 
-\section{Errorcode for communicator creation}
 As we have seen the function
-\begin{lstlisting}
-int pfft_create_procmesh_2d(
-    MPI_Comm comm, int np0, int np1,
-    MPI_Comm *comm_cart_2d);
-\end{lstlisting}
-creates a two-dimensional, periodic, Cartesian communicator. The \code{int} return value
-(not used in Listing~\ref{lst:man_c2c}) is the forwarded error code of \code{MPI_Cart_create}.
-It is equal to zero if the communicator was created successfully.
-The most common error is that the number of processes within the input
-communicator \code{comm} does not fit \code{np0 x np1}. In this case the Cartesian communicator
-is not generated and the return value is unequal to zero. Therefore, a typical sanity check might look like
-\begin{lstlisting}
-/* Create two-dimensional process grid of size np[0] x np[1],
-   if possible */
-if( pfft_create_procmesh_2d(MPI_COMM_WORLD, np[0], np[1],
-        &comm_cart_2d) )
-{
-  pfft_fprintf(MPI_COMM_WORLD, stderr,
-      "Error: This test file only works with %d processes.\n",
-      np[0]*np[1]);
-  MPI_Finalize();
-  return 1;
-}
-\end{lstlisting}
+
+::
+
+    int pfft_create_procmesh_2d(
+        MPI_Comm comm, int np0, int np1,
+        MPI_Comm *comm_cart_2d);
+
+creates a two-dimensional, periodic, Cartesian communicator. The ``int`` return
+value (not used in Listing [lst:man_c2c]) is the forwarded
+error code of ``MPI_Cart_create``. It is equal to zero if the communicator was created
+successfully. The most common error is that the number of processes
+within the input communicator ``comm`` does not fit ``np0 x np1``. In this case the Cartesian
+communicator is not generated and the return value is unequal to zero.
+Therefore, a typical sanity check might look like
+
+::
+
+    /* Create two-dimensional process grid of size np[0] x np[1],
+       if possible */
+    if( pfft_create_procmesh_2d(MPI_COMM_WORLD, np[0], np[1],
+            &comm_cart_2d) )
+    {
+      pfft_fprintf(MPI_COMM_WORLD, stderr,
+          "Error: This test file only works with %d processes.\n",
+          np[0]*np[1]);
+      MPI_Finalize();
+      return 1;
+    }
+
 Hereby, we use the PFFT library function
-\begin{lstlisting}
-void pfft_fprintf(
-    MPI_Comm comm, FILE *stream, const char *format, ...);
-\end{lstlisting}
-to print the error message.
-This function is similar to the standard C function \code{fprintf} with the exception, that only the process with MPI rank $0$
-within the given communicator \code{comm} will produce some output; see Section~\ref{sec:fprintf} for details.
-
-\section{Inplace transforms}
-Similar to FFTW, PFFT is able to compute parallel FFTs completely in place, which means that beside some
-constant buffers, no second data array is necessary. Especially, the global data communication
-can be performed in place. As far as we know, there is no other parallel FFT library beside FFTW and PFFT that
-supports this feature.
-This feature is enabled as soon as the pointer to the output array \code{out} is equal to the pointer to the input array \code{in}.
-E.g., in Listing~\ref{lst:man_c2c} we would call
-\begin{lstlisting}[firstnumber=34]
-/* Plan parallel forward FFT */
-plan = pfft_plan_dft_3d(n, in, in, comm_cart_2d,
-    PFFT_FORWARD, PFFT_TRANSPOSED_NONE);
-\end{lstlisting}
-
-\section{Higher dimensional data decomposition}
-The test program given in Listing~\ref{lst:man_c2c} used a two-dimensional data decomposition of a three-dimensional data set.
-Moreover, PFFT support the computation of any $d$-dimensional FFT with $r$-dimensional data decomposition
-as long as $r\le d-1$. For example, one can use a one-dimensional data decomposition for any two- or higher-dimensional data set,
-while the data set must be at least four-dimensional to fit to a three-dimensional data decomposition.
-The case $r=d$ is not supported efficiently, since during the parallel computations
-there is always at least one dimension that remains local, i.e., one dimensions stays non-decomposed.
-The only exception from this rule is the case $d=r=3$ that is supported by PFFT in a special way, see Section~\ref{sec:3don3d} for details.
-
-The dimensionality of the data decomposition is given by the dimension of the Cartesian communicator that
-goes into the PFFT planing routines. Therefore, we present a generalization of communicator creation function
-\begin{lstlisting}
-int pfft_create_procmesh(
-    int rnk_np, MPI_Comm comm, const int *np,
-    MPI_Comm *comm_cart);
-\end{lstlisting}
-Hereby, the array \code{np} of length \code{rnk_np} gives the size of the Cartesian communicator \code{cart_comm}.
-
-\section{Parallel data decomposition}\label{sec:par-data-decomp}
-In the following, we use the notation $\frac{n}{P}$ to symbolize that an array of length $n$ is broken into disjoint blocks and distributed on $P$ MPI processes.
-Hereby, the data is distributed in compliance to the FFTW-MPI data decompostion~\cite{fftw-mpi-data-distribution},
-i.e., the first \code{P/block} (rounded down) processes get a contiguous chunk of \code{block} elements,
-the next process gets the remaining \code{n - block * (n/block)} data elements, and all remaining processes get nothing.
-Thereby, the block size \code{block} defaults to \code{n/P} (rounded down) but can also be user defined.
-
-\subsection{Non-transposed and transposed data layout}
-In the following, we use the notation $\frac{n}{P}$ to symbolize that an array of length $n$ is distributed on $P$ MPI processes.
-The standard PFFT data decomposition of $h$ interleaved $d$-dimensional arrays of equal size $n_0 \times n_1\times \hdots \times n_{d-1}$
-on a $r$-dimensional process mesh of size $P_0\times \hdots \times P_{r-1}$ is given by the blocks
-\begin{equation*}
-  \frac{n_0}{P_0} \times \frac{n_1}{P_1} \times \hdots \times \frac{n_{r-1}}{P_{r-1}}  \times n_r \times n_{r+1} \times \hdots \times n_{d-1} \times h.
-\end{equation*}
-A PFFT created with planning flag \code{PFFT_TRANSPOSED_NONE} requires the inputs to be decomposed in this standard way and produces
-outputs that are decomposed in the same way.
-
-PFFT can save half of the global communication amount, if the data reordering to standard decomposition is omitted. 
-The transposed data decomposition is given by
-\begin{equation*}
-  \frac{n_1}{P_0} \times \frac{n_2}{P_1} \times \hdots \times \frac{n_{r}}{P_{r-1}}  \times n_0 \times n_{r+1} \times \hdots \times n_{d-1} \times h
-\end{equation*}
-A PFFT plan created with planning flag \code{PFFT_TRANSPOSED_OUT} produces outputs with transposed data decomposition.
-Analogously, a PFFT plan created with planning flag \code{PFFT_TRANSPOSED_IN} requires its inputs to be decomposed in the transposed way.
-Typically, one creates a forward plan with \code{PFFT_TRANSPOSED_OUT} and a backward plan with planning flag \code{PFFT_TRANSPOSED_IN}.
-
-Note that the flags \code{PFFT_TRANSPOSED_OUT} and \code{PFFT_TRANSPOSED_IN} must be passed to the array distribution function (see Section~\ref{sec:local-size})
-\emph{as well as} to the planner (see Section~\ref{sec:create-plan}).
-
-
-\subsection{Three-dimensional FFTs with three-dimensional data decomposition}\label{sec:3don3d}
-Many applications work with three-dimensional block decompositions of three-dimensional arrays.
-PFFT supports decompositions of the kind
-\begin{equation*}
-  \frac{n_0}{P_0} \times \frac{n_1}{P_1} \times \frac{n_2}{P_2} \times h.
-\end{equation*}
-However, PFFT applies a parallel algorithms that needs at least one non-distributed transform dimension (we do not transform along $h$),
-Therefore, we split the number of processes along the last dimension into two factors $P_2=Q_1Q_2$, remap
-the data to the two-dimensional decomposition
-\begin{equation*}
-  \frac{n_0}{P_0Q_0} \times \frac{n_1}{P_1Q_1} \times n_2 \times h,
-\end{equation*}
+
+::
+
+    void pfft_fprintf(
+        MPI_Comm comm, FILE *stream, const char *format, ...);
+
+to print the error message. This function is similar to the standard C
+function ``fprintf`` with the exception that only the process with MPI rank
+:math:`0` within the given communicator ``comm`` will produce some output; see
+Section [sec:fprintf] for details.
+
+Inplace transforms
+------------------
+
+Similar to FFTW, PFFT is able to compute parallel FFTs completely in
+place, which means that beside some constant buffers, no second data
+array is necessary. Especially, the global data communication can be
+performed in place. As far as we know, there is no other parallel FFT
+library beside FFTW and PFFT that supports this feature. This feature is
+enabled as soon as the pointer to the output array ``out`` is equal to the
+pointer to the input array ``in``. E.g., in Listing [lst:man_c2c] we
+would call
+
+::
+
+    /* Plan parallel forward FFT */
+    plan = pfft_plan_dft_3d(n, in, in, comm_cart_2d,
+        PFFT_FORWARD, PFFT_TRANSPOSED_NONE);
+
+Higher dimensional data decomposition
+-------------------------------------
+
+The test program given in Listing [lst:man_c2c] used a
+two-dimensional data decomposition of a three-dimensional data set.
+Moreover, PFFT supports the computation of any :math:`d`-dimensional FFT
+with :math:`r`-dimensional data decomposition as long as
+:math:`r\le d-1`. For example, one can use a one-dimensional data
+decomposition for any two- or higher-dimensional data set, while the
+data set must be at least four-dimensional to fit to a three-dimensional
+data decomposition. The case :math:`r=d` is not supported efficiently,
+since during the parallel computations there is always at least one
+dimension that remains local, i.e., one dimension stays non-decomposed.
+The only exception from this rule is the case :math:`d=r=3` that is
+supported by PFFT in a special way, see Section [sec:3don3d] for
+details.
+
+The dimensionality of the data decomposition is given by the dimension
+of the Cartesian communicator that goes into the PFFT planning routines.
+Therefore, we present a generalization of the communicator creation function
+
+::
+
+    int pfft_create_procmesh(
+        int rnk_np, MPI_Comm comm, const int *np,
+        MPI_Comm *comm_cart);
+
+Hereby, the array ``np`` of length ``rnk_np`` gives the size of the Cartesian communicator
+``comm_cart``.
+
+Parallel data decomposition
+---------------------------
+
+In the following, we use the notation :math:`\frac{n}{P}` to symbolize
+that an array of length :math:`n` is broken into disjoint blocks and
+distributed on :math:`P` MPI processes. Hereby, the data is distributed
+in compliance to the FFTW-MPI data decomposition, i.e., the first ``n/block``
+(rounded down) processes get a contiguous chunk of ``block`` elements, the next
+process gets the remaining ``n - block * (n/block)`` data elements, and all remaining processes
+get nothing. Thereby, the block size ``block`` defaults to ``n/P`` (rounded down) but can
+also be user defined.
+
+Non-transposed and transposed data layout
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In the following, we use the notation :math:`\frac{n}{P}` to symbolize
+that an array of length :math:`n` is distributed on :math:`P` MPI
+processes. The standard PFFT data decomposition of :math:`h` interleaved
+:math:`d`-dimensional arrays of equal size
+:math:`n_0 \times n_1\times \dots \times n_{d-1}` on an
+:math:`r`-dimensional process mesh of size
+:math:`P_0\times \dots \times P_{r-1}` is given by the blocks
+
+.. math:: \frac{n_0}{P_0} \times \frac{n_1}{P_1} \times \dots \times \frac{n_{r-1}}{P_{r-1}}  \times n_r \times n_{r+1} \times \dots \times n_{d-1} \times h.
+
+A PFFT plan created with planning flag ``PFFT_TRANSPOSED_NONE`` requires the inputs to be decomposed
+in this standard way and produces outputs that are decomposed in the
+same way.
+
+PFFT can save half of the global communication amount, if the data
+reordering to standard decomposition is omitted. The transposed data
+decomposition is given by
+
+.. math:: \frac{n_1}{P_0} \times \frac{n_2}{P_1} \times \dots \times \frac{n_{r}}{P_{r-1}}  \times n_0 \times n_{r+1} \times \dots \times n_{d-1} \times h
+
+A PFFT plan created with planning flag ``PFFT_TRANSPOSED_OUT`` produces outputs with transposed
+data decomposition. Analogously, a PFFT plan created with planning flag
+``PFFT_TRANSPOSED_IN`` requires its inputs to be decomposed in the transposed way. Typically,
+one creates a forward plan with ``PFFT_TRANSPOSED_OUT`` and a backward plan with planning flag ``PFFT_TRANSPOSED_IN``.
+
+Note that the flags ``PFFT_TRANSPOSED_OUT`` and ``PFFT_TRANSPOSED_IN`` must be passed to the array distribution
+function (see Section [sec:local-size]) *as well as* to the planner (see
+Section [sec:create-plan]).
+
+Three-dimensional FFTs with three-dimensional data decomposition
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Many applications work with three-dimensional block decompositions of
+three-dimensional arrays. PFFT supports decompositions of the kind
+
+.. math:: \frac{n_0}{P_0} \times \frac{n_1}{P_1} \times \frac{n_2}{P_2} \times h.
+
+However, PFFT applies a parallel algorithm that needs at least one
+non-distributed transform dimension (we do not transform along
+:math:`h`). Therefore, we split the number of processes along the last
+dimension into two factors :math:`P_2=Q_0Q_1`, remap the data to the
+two-dimensional decomposition
+
+.. math:: \frac{n_0}{P_0Q_0} \times \frac{n_1}{P_1Q_1} \times n_2 \times h,
+
 and compute the parallel FFT with this two-dimensional decomposition.
-Note that the 3d to 2d remap implies some very special restrictions on the block sizes for $n_0$ and $n_1$, i.e.,
-the blocks must be divisible by $Q_0$ and $Q_1$. More precisely, the default blocks of the 2d-decomposition
-are given by \code{n0/(P0*Q0)} and \code{n1/(P1*Q1)} (both divisions rounded down).
-This implies that the default blocks of the 3d-decomposition must be \code{n0/(P0*Q0) * Q0},
-\code{n1/(P1*Q1) * Q1}, and \code{n2/(Q0*Q1)} (all divisions rounded down).
+Note that the 3d to 2d remap implies some very special restrictions on
+the block sizes for :math:`n_0` and :math:`n_1`, i.e., the blocks must
+be divisible by :math:`Q_0` and :math:`Q_1`. More precisely, the default
+blocks of the 2d-decomposition are given by ``n0/(P0*Q0)`` and ``n1/(P1*Q1)`` (both divisions rounded
+down). This implies that the default blocks of the 3d-decomposition must
+be ``n0/(P0*Q0) * Q0``, ``n1/(P1*Q1) * Q1``, and ``n2/(Q0*Q1)`` (all divisions rounded down).
 
+Planning effort
+---------------
 
-\section{Planning effort}
 Pass one of the following flags
-\begin{compactitem}
-  \item \code{PFFT_ESTIMATE},
-  \item \code{PFFT_MEASURE},
-  \item \code{PFFT_PATIENT}, or,
-  \item \code{PFFT_EXHAUSIVE}
-\end{compactitem}
-to the PFFT planner in order to plan all internal FFTW plans with \code{FFTW_ESTIMATE}, \code{FFTW_MEASURE}, \code{FFTW_PATIENT}, or \code{FFTW_EXHAUSIVE},
-respectively. The default value is \code{PFFT_MEASURE}.
-
-PFFT uses FFTW plans for parallel array transposition and the serial transforms. In fact, every serial transform is a combination of
-strided lower-dimensional FFTs and a serial array transposition (necessary to prepare the global transposition) which can be done by a single FFTW plan.
-However, it turns out that FFTW sometimes performs better if the serial transposition and the strided FFTs are executed separately.
-Therefore, PFFT introduces the flag \code{PFFT_TUNE} that enables extensive run time tests in order to find the optimal sequence of
-serial strided FFT and serial transposition for every serial transform. These tests are disable on default which corresponds to the flag \code{PFFT_NO_TUNE}.
-
-\section{Preserving input data}
+
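+-  ``PFFT_ESTIMATE``,
+-  ``PFFT_MEASURE``,
+-  ``PFFT_PATIENT``, or,
+-  ``PFFT_EXHAUSTIVE``
+
+
+to the PFFT planner in order to plan all internal FFTW plans with ``FFTW_ESTIMATE``, ``FFTW_MEASURE``, ``FFTW_PATIENT``,
+or ``FFTW_EXHAUSTIVE``, respectively. The default value is ``PFFT_MEASURE``.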
+
+PFFT uses FFTW plans for parallel array transposition and the serial
+transforms. In fact, every serial transform is a combination of strided
+lower-dimensional FFTs and a serial array transposition (necessary to
+prepare the global transposition) which can be done by a single FFTW
+plan. However, it turns out that FFTW sometimes performs better if the
+serial transposition and the strided FFTs are executed separately.
+Therefore, PFFT introduces the flag ``PFFT_TUNE`` that enables extensive run time
+tests in order to find the optimal sequence of serial strided FFT and
+serial transposition for every serial transform. These tests are disabled
+by default, which corresponds to the flag ``PFFT_NO_TUNE``.
+
+Preserving input data
+---------------------
+
 The following flags
-\begin{compactitem}
-  \item \code{PFFT_PRESERVE_INPUT},
-  \item \code{PFFT_DESTROY_INPUT}, and,
-  \item \code{PFFT_BUFFERED_INPLACE}
-\end{compactitem}
-only take effect for out-of-place transforms.
-The first one behaves analogously to the FFTW flag \code{FFTW_PRESERVE_INPUT} and ensures that the input values are not overwritten.
-In fact, this flag implies that only the first serial transform is executed out-of-place and all
-successive steps are performed in-place on the output array.
-In compliance to FFTW, this is the default behaviour for out-of-place plans.
-
-The second flag behaves analogously to the FFTW flag \code{FFTW_DESTROY_INPUT} and tells the planner that
-the input array can be used as scratch array. This may give some speedup for out-of-place plans,
-because all the intermediate transforms and transposition steps can be performed out-of-place.
-
-Finally, the flag \code{PFFT_BUFFERED_INPLACE} can be used for out-of-place plans that store its inputs and outputs in the same array,
-i.e., array \code{out} is used for intermediate out-of-place transforms and transpositions but the PFFT inputs and outputs are stored in array \code{in}.
-
-
-\section{FFTs with shifted index sets}
-\todo[inline]{Describe shifted input and output}
-\begin{compactitem}
-  \item \code{PFFT_SHIFTED_IN}
-  \item \code{PFFT_SHIFTED_OUT}
-\end{compactitem}
-
-\section{Pruned FFT and Shifted Index Sets}
-\todo[inline]{Describe pruned FFT with shifted input and output}
-\subsection{Pruned FFT}
+
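+-  ``PFFT_PRESERVE_INPUT``,
+-  ``PFFT_DESTROY_INPUT``, and,
+-  ``PFFT_BUFFERED_INPLACE``
+
+only take effect for out-of-place transforms. The first one behaves
+analogously to the FFTW flag ``FFTW_PRESERVE_INPUT`` and ensures that the input values are not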
+overwritten. In fact, this flag implies that only the first serial
+transform is executed out-of-place and all successive steps are
+performed in-place on the output array. In compliance to FFTW, this is
+the default behaviour for out-of-place plans.
+
+The second flag behaves analogously to the FFTW flag ``FFTW_DESTROY_INPUT`` and tells the
+planner that the input array can be used as scratch array. This may give
+some speedup for out-of-place plans, because all the intermediate
+transforms and transposition steps can be performed out-of-place.
+
+Finally, the flag ``PFFT_BUFFERED_INPLACE`` can be used for out-of-place plans that store their
+inputs and outputs in the same array, i.e., array ``out`` is used for
+intermediate out-of-place transforms and transpositions but the PFFT
+inputs and outputs are stored in array ``in``.
+
+FFTs with shifted index sets
+----------------------------
+
+Pruned FFT and Shifted Index Sets
+---------------------------------
+
+Pruned FFT
+~~~~~~~~~~
+
 For pruned r2r- and c2c-FFT are defined as
-\begin{equation*}
-  g_l = \sum_{k=0}^{n_i-1} \hat g_k \eim{kl/n}, \quad l=0,\hdots,n_o-1,
-\end{equation*}
-where $n_i\le n$ and $n_o\le n$.
 
-\subsection{Shifted Index Sets}
-For $N\in 2\N$ we define the FFT with shifted inputs
+.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_k \mathrm{e}^{-2\pi\mathrm{i} kl/n}, \quad l=0,\dots,n_o-1,
+
+where :math:`n_i\le n` and :math:`n_o\le n`.
+
+Shifted Index Sets
+~~~~~~~~~~~~~~~~~~
+
+For :math:`N\in 2\mathbb{N}` we define the FFT with shifted inputs
+
+For :math:`K,L,N\in 2\mathbb{N}`, :math:`K<N`, :math:`L<N` we define
+
+Precisions
+----------
 
+PFFT handles multiple precisions exactly in the same way as FFTW.
+Therefore, we quote part of the FFTW manual in the context of PFFT:
 
-For $K,L,N\in 2\N$, $L<N$, $L<N$ we define
+You can install single and long-double precision versions of PFFT, which
+replace double with float and long double, respectively; see
+[sec:install]. To use these interfaces, you must
 
+-  Link to the single/long-double libraries; on Unix, ``-lpfftf`` or ``-lpfftl`` instead of (or in
+   addition to) ``-lpfft``. (You can link to the different-precision libraries
+   simultaneously.)
 
+-  Include the same ``<pfft.h>`` header file.
 
+-  Replace all lowercase instances of ‘``pfft_``’ with ‘``pfftf_``’ or ‘``pfftl_``’ for single or
+   long-double precision, respectively. (``pfft_complex`` becomes ``pfftf_complex``, ``pfft_execute`` becomes ``pfftf_execute``, etcetera.)
 
+-  Uppercase names, i.e. names beginning with ‘``PFFT_``’, remain the same.
 
-\section{Precisions}\label{sec:prec}
-PFFT handles multiple precisions exactly in the same way as FFTW. Therefore, we quote part~\cite{fftw-prec} of the FFTW manual in the context of PFFT:
+-  Replace ``double`` with ``float`` or ``long double`` for subroutine parameters.
 
-You can install single and long-double precision versions of PFFT, which replace double with float and long double, respectively; see \ref{sec:install}.
-To use these interfaces, you must
-\begin{compactitem}
-  \item Link to the single/long-double libraries; on Unix, \code{-lpfftf} or \code{-lpfftl} instead of (or in addition to) \code{-lpfft}.
-        (You can link to the different-precision libraries simultaneously.)
-  \item Include the same \code{<pfft.h>} header file.
-  \item Replace all lowercase instances of ‘\code{pfft_}’ with ‘\code{pfftf_}’ or ‘\code{pfftl_}’ for single or long-double precision, respectively.
-        (\code{pfft_complex} becomes \code{pfftf_complex}, \code{pfft_execute} becomes \code{pfftf_execute}, etcetera.)
-  \item Uppercase names, i.e. names beginning with ‘\code{PFFT_}’, remain the same.
-  \item Replace \code{double} with \code{float} or \code{long double} for subroutine parameters.
-\end{compactitem}
+Ghost cell communication
+------------------------
 
-\section{Ghost cell communication}
-\todo[inline]{explain ghost cell communication with a test file}
+Fortran interface
+-----------------
 
-\section{Fortran interface}
-\todo[inline]{explain F03 interface with a test file}

From e17ef6ed37238a427def875bc6dbb8c45bca3d14 Mon Sep 17 00:00:00 2001
From: Yu Feng <rainwoodman@gmail.com>
Date: Sun, 13 Sep 2015 04:04:38 -0700
Subject: [PATCH 4/6] Use automated tools to convert most of the text.

Labels need to be fixed.
---
 doc/convert.sh    |  11 +
 doc/develop.rst   |  18 +-
 doc/features.rst  | 135 +++----
 doc/fortran.rst   |   4 +
 doc/index.rst     |  20 +-
 doc/install.rst   |  82 +++--
 doc/interface.rst |  51 +--
 doc/intro.rst     |  13 +-
 doc/preample.tex  | 105 ++++++
 doc/reference.rst | 884 +++++++++++++++++++++++++---------------------
 doc/reference.tex |   2 +-
 doc/shortcuts.tex |   3 +-
 doc/tutorial.rst  | 296 +++++++++-------
 13 files changed, 960 insertions(+), 664 deletions(-)
 create mode 100644 doc/convert.sh
 create mode 100644 doc/preample.tex

diff --git a/doc/convert.sh b/doc/convert.sh
new file mode 100644
index 0000000..242a9eb
--- /dev/null
+++ b/doc/convert.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+# Convert every LaTeX source in this directory to reStructuredText via pandoc.
+for i in *.tex; do
+    echo "$i"
+    # Prepend the macro preamble, then rewrite \hdots and \code{} before pandoc.
+    cat preample.tex "$i" | sed \
+        -e 's;\\hdots;\\dots;g' \
+        -e 's/\\code{\([^}]*\)}/\\verb+\1+/g' \
+        | pandoc -f latex -t rst \
+    > "${i%.tex}.rst"
+done
diff --git a/doc/develop.rst b/doc/develop.rst
index 60781d0..4d4830d 100644
--- a/doc/develop.rst
+++ b/doc/develop.rst
@@ -1,3 +1,7 @@
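+.. pandoc preamble residue (not document content): [2]ifpackageloaded#1#2 [2]ifpackageloaded#1#2 [3]ifpackageloaded#1#2#3
+
+.. pandoc preamble residue (not document content): #1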
+
 Developers Guide
 ================
 
@@ -23,16 +27,18 @@ manual:
 ToDo
 ====
 
--  is defined as
+-  ``PFFT_FORWARD`` is defined as ``FFTW_FORWARD``
 
--  is defined as :math:`-1`
+-  ``FFTW_FORWARD`` is defined as :math:`-1`
 
--  PFFT allows to chose between and , which is not implemented by FFTW.
+-  PFFT allows one to choose between ``FFTW_FORWARD`` and ``FFTW_BACKWARD``,
+   which is not implemented by FFTW.
 
--  Matlab uses the same sign convention, i.e., :math:`-1` for and
-   :math:`+1` for
+-  Matlab uses the same sign convention, i.e., :math:`-1` for ``fft``
+   and :math:`+1` for ``ifftn``
 
 Measuring parallel run times
 ----------------------------
 
-Use in front of every call to function to avoid unbalanced run times.
+Use ``MPI_Barrier`` in front of every call to a ``pfft_`` function to
+avoid unbalanced run times.
diff --git a/doc/features.rst b/doc/features.rst
index 417abf8..98eb675 100644
--- a/doc/features.rst
+++ b/doc/features.rst
@@ -1,34 +1,38 @@
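+.. pandoc preamble residue (not document content): [2]ifpackageloaded#1#2 [2]ifpackageloaded#1#2 [3]ifpackageloaded#1#2#3
+
+.. pandoc preamble residue (not document content): #1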
+
 Advanced Features
 =================
 
 How to Deal with FFT Index Shifts in Parallel
 ---------------------------------------------
 
-Let :math:`n\in2\N`. A common problem is that the index of the FFT input
-and/or output array runs between
-:math:`-\nicefrac n2,\hdots,\nicefrac n2-1`, but the FFT library
-requires them to run between :math:`0,\hdots,n-1`. With serial program
-execution one can easily remap the input data :math:`\hat g_k` in a way
-that is suitable for the library, i.e.,
+Let :math:`n\in2{\ensuremath{\mathbb{N}}}`. A common problem is that the
+index of the FFT input and/or output array runs between
+:math:`-\nicefrac n2,\dots,\nicefrac n2-1`, but the FFT library requires
+them to run between :math:`0,\dots,n-1`. With serial program execution
+one can easily remap the input data :math:`\hat g_k` in a way that is
+suitable for the library, i.e.,
 
-.. math:: \hat f_k := \hat g_{(k-\nicefrac n2\bmod n)}, \quad k = 0,\hdots,n-1.
+.. math:: \hat f_k := \hat g_{(k-\nicefrac n2\bmod n)}, \quad k = 0,\dots,n-1.
 
 Similarly, one could remap the outputs of the library :math:`f_l`,
 :math:`l=0,\cdots,n-1` in the opposite direction in order to get the
 required outputs, i.e.,
 
-.. math:: g_l := f_{l \bmod n}, \quad l = -\nicefrac n2,\hdots,\nicefrac n2-1.
+.. math:: g_l := f_{l \bmod n}, \quad l = -\nicefrac n2,\dots,\nicefrac n2-1.
 
-These shifts are also known as in Matlab.
+These shifts are also known as ``fftshift`` in Matlab.
 
-However, with distributed memory these operations require more complex
-data movements and result in a global communication. For example, the
-first index of the array moves to the middle and, therefore, the
-corresponding data move to another MPI process. Fortunately, this
-communication can be avoided at the cost of little extra computation. At
-the end of the section we present two PFFT library functions that
-perform the necessary pre- and postprocessing for shifted input and
-output index sets.
+However, with distributed memory these ``fftshift`` operations require
+more complex data movements and result in a global communication. For
+example, the first index of the array moves to the middle and,
+therefore, the corresponding data move to another MPI process.
+Fortunately, this communication can be avoided at the cost of little
+extra computation. At the end of the section we present two PFFT library
+functions that perform the necessary pre- and postprocessing for shifted
+input and output index sets.
 
 Shift with half the FFT size
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -40,9 +44,10 @@ shift the input (``PFFT_SHIFTED_IN``) and/or to shift the output
 
 Here, we are interested in the computation of
 
-.. math:: g_l = \sum_{k=-\nicefrac{n_i}{2}}^{\nicefrac{n_i}{2}-1} \hat g_k \eim{kl/n}, \quad l=-\nicefrac{n_o}{2},\hdots,\nicefrac{n_o}{2}-1
+.. math:: g_l = \sum_{k=-\nicefrac{n_i}{2}}^{\nicefrac{n_i}{2}-1} \hat g_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}, \quad l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1
 
-with :math:`n, n_i, n_o \in 2\N` and :math:`n>n_i`, :math:`n>n_o`.
+with :math:`n, n_i, n_o \in 2{\ensuremath{\mathbb{N}}}` and
+:math:`n>n_i`, :math:`n>n_o`.
 
 With an index shift of :math:`\nicefrac n2` both in :math:`k` and
 :math:`l` this equivalent to the computation of
@@ -52,32 +57,32 @@ With an index shift of :math:`\nicefrac n2` both in :math:`k` and
    \begin{aligned}
      g_{(l-\nicefrac{n}{2})}
      &= \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
-        \hat g_{(k-\nicefrac{n}{2})} \eim{(k-\nicefrac n2)(l-\nicefrac n2)/n} \\
-     &= \e^{+\pi\ti l} 
+        \hat g_{(k-\nicefrac{n}{2})} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} (k-\nicefrac n2)(l-\nicefrac n2)/n}}} \\
+     &= {{\ensuremath{\mathrm{e}}}}^{+\pi{\ensuremath{\text{\scriptsize{i}}}}l} 
           \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
-          \left(\hat g_{(k-\nicefrac{n}{2})}\e^{+\pi\ti (k-\nicefrac n2)}\right) \eim{kl/n} \\
-     &= \e^{+\pi\ti(l-\nicefrac n2)} 
+          \left(\hat g_{(k-\nicefrac{n}{2})}{{\ensuremath{\mathrm{e}}}}^{+\pi{\ensuremath{\text{\scriptsize{i}}}}(k-\nicefrac n2)}\right) {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}} \\
+     &= {{\ensuremath{\mathrm{e}}}}^{+\pi{\ensuremath{\text{\scriptsize{i}}}}(l-\nicefrac n2)} 
         \underbrace{
           \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
-          \underbrace{\left(\hat g_{(k-\nicefrac{n}{2})}\e^{+\pi\ti k}\right)}_{\hat f_k} \eim{kl/n}
+          \underbrace{\left(\hat g_{(k-\nicefrac{n}{2})}{{\ensuremath{\mathrm{e}}}}^{+\pi{\ensuremath{\text{\scriptsize{i}}}}k}\right)}_{\hat f_k} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}
         }_{f_l}\end{aligned}
 
 for
+:math:`l=\nicefrac n2-\nicefrac{n_o}{2},\dots,\nicefrac n2 +\nicefrac{n_o}{2}-1`.
+:math:` l=\nicefrac n2-\nicefrac{n_o}{2},\dots,\nicefrac n2 +\nicefrac{n_o}{2}-1`.
 Therefore, we get the following algorithm
 
-.. math:: f_l = \sum_{k=0}^n \hat g_k \eim{kl/n}, \quad l=-\nicefrac{n_o}{2},\hdots,\nicefrac{n_o}{2}-1
+.. math:: f_l = \sum_{k=0}^n \hat g_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}, \quad l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1
 
 The special case :math:`k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}`
 corresponds to the shifts the arrays ()
 
-[1] =1.1ex For :math:`k=0,\hdots,n-1` set :math:`\hat f_k = 0`. For
-:math:`k=-\nicefrac{n_i}{2},\hdots,\nicefrac{n_i}{2}-1` compute
+For :math:`k=0,\dots,n-1` set :math:`\hat f_k = 0`. For
+:math:`k=-\nicefrac{n_i}{2},\dots,\nicefrac{n_i}{2}-1` compute
 :math:`\hat f_{(k+\nicefrac{n}{2})} = (-1)^{(k+\nicefrac{n}{2})} \hat g_{k}`.
-For :math:`l=0,\hdots,n-1` compute
-:math:`f_l = \sum_{k=0}^{n} \hat f_k \eim{kl/n}` using PFFT. For
-:math:`l=-\nicefrac{n_o}{2},\hdots,\nicefrac{n_o}{2}-1` compute
-:math:`g_l = (-1)^l f_{(l+n/2)} `.
+For :math:`l=0,\dots,n-1` compute
+:math:`f_l = \sum_{k=0}^{n} \hat f_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}`
+using PFFT. For :math:`l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1`
+compute :math:`g_l = (-1)^l f_{(l+n/2)}`.
 
 Note, that this shift implies that the library deals with pruned FFTs in
 a special way, i.e., half of the zeros are added at the beginning of the
@@ -89,23 +94,25 @@ Arbitrary shifts
 More general shifts must be done by the user.
 
 In a more general setting, we are interested in the computation of FFTs
-with shifted index sets, i.e., assume :math:`k_s,l_s\in\Z` and compute
+with shifted index sets, i.e., assume
+:math:`k_s,l_s\in{\ensuremath{\mathbb{Z}}}` and compute
 
 .. math::
 
-   g_l = \sum_{k=k_s}^{n_i+k_s-1} \hat g_k \eim{kl/n},
-     \quad l=l_s,\hdots,n_o+l_s-1\,.
+   g_l = \sum_{k=k_s}^{n_i+k_s-1} \hat g_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}},
+     \quad l=l_s,\dots,n_o+l_s-1\,.
 
 Because of the periodicity of the FFT this can be easily performed by
- [alg:fftshift:sub:`t`\ ranslation].
+Alg. [alg:fftshift_translation].
 
 [alg:fftshift:sub:`t`\ ranslation]
 
-[1] =1.1ex For :math:`k=0,\hdots,n_i-1` assign
+For :math:`k=0,\dots,n_i-1` assign
 :math:`\hat f_k = \hat g_{(k+k_s\bmod n_i)}`. For
-:math:`l=0,\hdots,n_o-1` compute
-:math:`f_l = \sum_{k=0}^{n_i} \hat f_k \eim{kl/n}` using PFFT. For
-:math:`l=0,\hdots,n_o-1` assign :math:`g_l = f_{(l-l_s\bmod n_o)}`.
+:math:`l=0,\dots,n_o-1` compute
+:math:`f_l = \sum_{k=0}^{n_i} \hat f_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}`
+using PFFT. For :math:`l=0,\dots,n_o-1` assign
+:math:`g_l = f_{(l-l_s\bmod n_o)}`.
 
 However, this involves explicit data movement since the sequence of data
 changes. For a our parallel data decomposition the change of data layout
@@ -117,45 +124,46 @@ computation of
    \begin{aligned}
      g_{l+l_s}
      &=
-       \sum_{k=k_s}^{n_i+k_s-1} \hat g_k \eim{k(l+l_s)/n}
+       \sum_{k=k_s}^{n_i+k_s-1} \hat g_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} k(l+l_s)/n}}}
        =
-       \sum_{k=0}^{n_i-1} \hat g_{k+k_s} \eim{(k+k_s)(l+l_s)/n} \\
+       \sum_{k=0}^{n_i-1} \hat g_{k+k_s} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} (k+k_s)(l+l_s)/n}}} \\
      &=
-       \eim{k_sl/n} \sum_{k=0}^{n_i-1} \underbrace{\left(\hat g_{k+k_s}\eim{(k+k_s)l_s/n}\right)}_{=: \hat f_k} \eim{kl/n}\end{aligned}
+       {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} k_sl/n}}} \sum_{k=0}^{n_i-1} \underbrace{\left(\hat g_{k+k_s}{\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} (k+k_s)l_s/n}}}\right)}_{=: \hat f_k} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}\end{aligned}
 
-for all :math:`l=0,\hdots,n_o-1`. The resulting
- [alg:fftshift:sub:`m`\ odulation] preserves the sequence of data at the
-price of some extra computation.
+for all :math:`l=0,\dots,n_o-1`. The resulting
+Alg. [alg:fftshift_modulation] preserves the sequence of data at
+the price of some extra computation.
 
 [alg:fftshift:sub:`m`\ odulation]
 
-[1] =1.1ex For :math:`k=0,\hdots,n_i-1` compute
-:math:`\hat f_k = \hat g_{(k+k_s)} \eim{(k+k_s)l_s/n}`. For
-:math:`l=0,\hdots,n_o-1` compute
-:math:`f_l = \sum_{k=0}^{n_i} \hat f_k \eim{kl/n}` using PFFT. For
-:math:`l=0,\hdots,n_o-1` compute :math:`g_{(l+l_s)} = f_l \eim{k_sl/n}`.
+For :math:`k=0,\dots,n_i-1` compute
+:math:`\hat f_k = \hat g_{(k+k_s)} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} (k+k_s)l_s/n}}}`.
+For :math:`l=0,\dots,n_o-1` compute
+:math:`f_l = \sum_{k=0}^{n_i} \hat f_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}`
+using PFFT. For :math:`l=0,\dots,n_o-1` compute
+:math:`g_{(l+l_s)} = f_l {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} k_sl/n}}}`.
 
 The special case :math:`k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}`
 corresponds to the shifts the arrays ()
 
-[1] =1.1ex For :math:`k=0,\hdots,n_i-1` compute
-:math:`\hat f_k = \hat g_{(k-\nicefrac{n_i}{2})} \e^{+\pi\ti (k-\nicefrac{n_i}{2})n_o/n}`.
-For :math:`l=0,\hdots,n_o-1` compute
-:math:`f_l = \sum_{k=0}^{n_i} \hat f_k \eim{kl/n}` using PFFT. For
-:math:`l=0,\hdots,n_o-1` compute
-:math:`g_{(l-\nicefrac{n_o}{2})} = f_l \e^{+\pi\ti n_i l/n}`.
+For :math:`k=0,\dots,n_i-1` compute
+:math:`\hat f_k = \hat g_{(k-\nicefrac{n_i}{2})} {{\ensuremath{\mathrm{e}}}}^{+\pi{\ensuremath{\text{\scriptsize{i}}}}(k-\nicefrac{n_i}{2})n_o/n}`.
+For :math:`l=0,\dots,n_o-1` compute
+:math:`f_l = \sum_{k=0}^{n_i} \hat f_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}`
+using PFFT. For :math:`l=0,\dots,n_o-1` compute
+:math:`g_{(l-\nicefrac{n_o}{2})} = f_l {{\ensuremath{\mathrm{e}}}}^{+\pi{\ensuremath{\text{\scriptsize{i}}}}n_i l/n}`.
 
 Parallel pruned FFT
 -------------------
 
 Within PFFT we define a pruned FFT as
 
-.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_{k} \eim{kl/n}, \quad l=0,\hdots,n_o-1.
+.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_{k} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}, \quad l=0,\dots,n_o-1.
 
 Formally, this is equivallent to the following regular size :math:`n`
 FFT
 
-.. math:: f_l = \sum_{k=0}^{n-1} \hat f_{k} \eim{kl/n}, \quad l=0,\hdots,n,
+.. math:: f_l = \sum_{k=0}^{n-1} \hat f_{k} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}, \quad l=0,\dots,n,
 
 with
 
@@ -163,12 +171,13 @@ with
 
    \hat g_k := 
      \begin{cases}
-     \hat f_k, &: k=0,\hdots,n_1-1, \\
-     0         &: k=n_i,\hdots,n-1,    
+     \hat f_k, &: k=0,\dots,n_i-1, \\
+     0         &: k=n_i,\dots,n-1,    
      \end{cases}
 
-and :math:`f_l := g_l`, :math:`k=0,\hdots,n_o-1`. I.e., we add
+and :math:`f_l := g_l`, :math:`l=0,\dots,n_o-1`. I.e., we add
 :math:`n-n_i` zeros at the end of the input array and throw away
 :math:`n-n_o` entries at the end of the output array.
 
-The definition of pruned FFT changes for and .
+The definition of pruned FFT changes for ``PFFT_SHIFTED_IN`` and
+``PFFT_SHIFTED_OUT``.
diff --git a/doc/fortran.rst b/doc/fortran.rst
index fad7927..864aa83 100644
--- a/doc/fortran.rst
+++ b/doc/fortran.rst
@@ -1,3 +1,7 @@
+.. Stray LaTeX macro residue (ifpackageloaded...) was removed from here.
+
+..
+
 Fotran Interface
 ================
 
diff --git a/doc/index.rst b/doc/index.rst
index 4a5cbab..2e0d8e2 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -11,20 +11,20 @@ Contents:
 .. toctree::
    :maxdepth: 2
 
-  intro
-  tutorial
-  install
-  features
-  interface
-  reference
-  develop
+   intro
+   tutorial
+   install
+   features
+   interface
+   reference
+   develop
 
 
 
 Indices and tables
 ==================
 
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
 
diff --git a/doc/install.rst b/doc/install.rst
index 87246e5..7eb1370 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -1,3 +1,7 @@
+.. Stray LaTeX macro residue (ifpackageloaded...) was removed from here.
+
+..
+
 Installation and linking
 ========================
 
@@ -13,9 +17,10 @@ workflow
 Install of the latest official FFTW release
 -------------------------------------------
 
-PFFT depends on Release  of the FFTW library . For the sake of
+PFFT depends on Release 3.3.3 of the FFTW library. For the sake of
 completeness, we show the command line based install procedure in the
-following. However, note that we provide install scripts on that
+following. However, note that we provide install scripts on
+`www.tu-chemnitz.de/~mpip/software.php <http://www.tu-chemnitz.de/~mpip/software.php>`__ that
 simplify the install a lot. We highly recommend to use these install
 scripts, since they additionally apply several performance patches and
 bugfixes that have been submitted to the FFTW developers but are not yet
@@ -31,14 +36,15 @@ included in the official FFTW releases.
     make install
 
 The MPI algorithms of FFTW must be build with a MPI C compiler. Add the
-statement at the end of line [lst:fftw:conf] if the script fails to
-determine the right MPI C compiler . Similarly, the MPI Fortran compiler
-is set by .
+statement ``MPICC=$MPICCOMP`` at the end of line [lst:fftw:conf] if the
+``configure`` script fails to determine the right MPI C compiler
+``$MPICCOMP``. Similarly, the MPI Fortran compiler ``$MPIFCOMP`` is
+set by ``MPIFC=$MPIFCOMP``.
 
 Install of the PFFT library
 ---------------------------
 
-In the simplest case, the hardware platform and the - library are
+In the simplest case, the hardware platform and the FFTW-3.3.3 library are
 recognized by the PFFT configure script automatically, so all we have to
 do is
 
@@ -52,18 +58,19 @@ do is
     make check
     make install
 
-Hereby, the optional call builds the test programs. If the - software
-library is already installed on your system but not found by the
-configure script, you can provide the FFTW installation directory to
-configure by
+Hereby, the optional call ``make check`` builds the test programs. If
+the FFTW-3.3.3 software library is already installed on your system but not
+found by the configure script, you can provide the FFTW installation
+directory ``$FFTWDIR`` to configure by
 
 .. code:: bash
 
     ./configure --with-fftw3=$FFTWDIR
 
-This call implies that the FFTW header files are located in and the FFTW
-library files are located in . Otherwise, one should specify the FFTW
-include path and the FFTW library path separately by
+This call implies that the FFTW header files are located in
+``$FFTWDIR/include`` and the FFTW library files are located in
+``$FFTWDIR/lib``. Otherwise, one should specify the FFTW include path
+``$FFTWINC`` and the FFTW library path ``$FFTWLIB`` separately by
 
 ::
 
@@ -76,30 +83,34 @@ At the end, this is equivalent to
     ./configure CPPFLAGS=-I$FFTWINC LDFLAGS=-L$FFTWLIB
 
 which is more common to experienced users of the Autotools. To install
-PFFT in a user specified directory call configure with the option
+PFFT in a user specified directory ``$PFFTINSTDIR`` call configure with
+the option
 
 ::
 
     ./configure --prefix=$PFFTINSTDIR
 
 However, this option is mandatory whenever you do not have root
-permissions on your machine, since the default install paths of are not
-accessible by standard users. The PFFT library must be built with a MPI
-compiler. In Section [sec:fftw\ :sub:`i`\ nst] we already described how
-to hand the right compilers to the script. Some more options are
+permissions on your machine, since the default install paths of
+``configure`` are not accessible by standard users. The PFFT library
+must be built with a MPI compiler. In Section [sec:fftw_inst]
+we already described how to hand the right compilers to the
+``configure`` script. Some more options are
 
-: Produces a single-precision version of PFFT (float) instead of the
-default double-precision (double); see [sec:prec].
+``--enable-float``: Produces a single-precision version
+of PFFT (float) instead of the default double-precision (double); see
+[sec:prec].
 
-: Produces a long-double precision version of PFFT (long double) instead
-of the default double-precision (double); see [sec:prec].
+``--enable-long-double``: Produces a long-double
+precision version of PFFT (long double) instead of the default
+double-precision (double); see [sec:prec].
 
-: Disables inclusion of Fortran wrapper routines in the standard PFFT
-libraries.
+``--disable-fortran``: Disables inclusion of Fortran wrapper routines in
+the standard PFFT libraries.
 
-: Disables build of test programs.
+``--disable-tests``: Disables build of test programs.
 
-For more details on the options of the script call
+For more details on the options of the ``configure`` script call
 
 ::
 
@@ -114,19 +125,22 @@ All programs using PFFT should include its header file
 
     #include <pfft.h>
 
-This header includes the FFTW headers , automatically. Make sure that
-the compiler can find them by setting the include flags appropriately.
-You must also link to the PFFT, FFTW and FFTW-MPI libraries. On Unix,
-this means adding at the end of the link command. For example, to build
-use the following compiler invocation
+This header includes the FFTW headers ``fftw3.h``, ``fftw3-mpi.h``
+automatically. Make sure that the compiler can find them by setting the
+include flags appropriately. You must also link to the PFFT, FFTW and
+FFTW-MPI libraries. On Unix, this means adding
+``-lpfft -lfftw3_mpi -lfftw3 -lm`` at the end of the link command. For
+example, to build ``pfft_test.c`` use the following compiler invocation
 
 ::
 
     mpicc pfft_test.c -I$PFFTINC -I$FFTWINC -L$PFFTLIB -L$FFTWLIB -lpfft -lfftw3_mpi -lfftw3 -lm
 
-Substitute by any other MPI C compiler if you like. , , , and denote the
-PFFT and FFTW include and library paths, respectively. If you use the
-install scripts mentioned in Sect. [sec:pfft-inst], these paths will be
+Substitute ``mpicc`` by any other MPI C compiler if you like.
+``$PFFTINC``, ``$FFTWINC``, ``$PFFTLIB``, and ``$FFTWLIB`` denote
+the PFFT and FFTW include and library paths, respectively. If you use
+the install scripts mentioned in Sect. [sec:pfft-inst], these paths will
+be
 
 ::
 
diff --git a/doc/interface.rst b/doc/interface.rst
index ce3cf84..ff25f73 100644
--- a/doc/interface.rst
+++ b/doc/interface.rst
@@ -1,3 +1,7 @@
+.. Stray LaTeX macro residue (ifpackageloaded...) was removed from here.
+
+..
+
 Interface Layers of the PFFT Library
 ====================================
 
@@ -9,8 +13,8 @@ list of all PFFT functions is given in Chapter [chap:ref].
 Basic Interface
 ---------------
 
-The interface is the simplest interface layer. It is suitable for the
-planning of three-dimensional FFTs.
+The ``_3d`` interface is the simplest interface layer. It is suitable
+for the planning of three-dimensional FFTs.
 
 ::
 
@@ -28,10 +32,11 @@ planning of three-dimensional FFTs.
         pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
         int sign, unsigned pfft_flags);
 
-Hereby, , , , , and are arrays of length .
+Hereby, ``n``, ``local_ni``, ``local_i_start``, ``local_no``, and
+``local_o_start`` are ``ptrdiff_t`` arrays of length ``3``.
 
-The basic interface generalizes the interface to FFTs of arbitrary
-dimension .
+The basic interface generalizes the ``_3d`` interface to FFTs of
+arbitrary dimension ``rnk_n``.
 
 ::
 
@@ -50,16 +55,18 @@ dimension .
         pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
         int sign, unsigned pfft_flags);
 
-Therefore, , , , , and become arrays of length .
+Therefore, ``n``, ``local_ni``, ``local_i_start``, ``local_no``, and
+``local_o_start`` become arrays of length ``rnk_n``.
 
 Advanced Interface
 ------------------
 
-The advanced interface introduces the arrays and of length that give the
-pruned FFT input and output size. Furthermore, the arrays and of length
-( being the dimension of the process mesh) serve to adjust the block
-size of the input and output block decomposition. The additional
-parameter gives the number of transforms that will be computed
+The advanced interface introduces the arrays ``ni`` and ``no`` of length
+``rnk_n`` that give the pruned FFT input and output size. Furthermore,
+the arrays ``iblock`` and ``oblock`` of length ``rnk_pm`` (``rnk_pm``
+being the dimension of the process mesh) serve to adjust the block size
+of the input and output block decomposition. The additional parameter
+``howmany`` gives the number of transforms that will be computed
 simultaneously.
 
 ::
@@ -87,8 +94,8 @@ simultaneously.
 Preliminary: Skip Serial Transformations
 ----------------------------------------
 
-The interface extends the interface by adding the possibility to skip
-some of the serial FFTs.
+The ``_skipped`` interface extends the ``_many`` interface by adding the
+possibility to skip some of the serial FFTs.
 
 ::
 
@@ -100,13 +107,15 @@ some of the serial FFTs.
         pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart,
         int sign, unsigned pfft_flags);
 
-Hereby, is an array of length ( being the mesh dimension of the
-communicator ). For set if the -th serial transformation should be
-computed, otherwise set . Note that the local transpositions are always
-performed, since they are a prerequisite for the global communication to
-work. At the moment it is only possible to skip the whole serial
-transform along the last dimensions. However, this behaviour can be
-realized by a call of a -dimensional PFFT with
+Hereby, ``skip_trafos`` is an ``int`` array of length ``rnk_pm+1``
+(``rnk_pm`` being the mesh dimension of the communicator ``comm_cart``).
+For ``t=0,...,rnk_pm`` set ``skip_trafos[t]=1`` if the ``t``-th serial
+transformation should be computed, otherwise set ``skip_trafos[t]=0``.
+Note that the local transpositions are always performed, since they are
+a prerequisite for the global communication to work. At the moment it is
+only possible to skip the whole serial transform along the last
+``rnk_n-rnk_pm-1`` dimensions. However, this behaviour can be realized
+by a call of a ``(rnk_pm+1)``-dimensional PFFT with
 
 ::
 
@@ -114,4 +123,4 @@ realized by a call of a -dimensional PFFT with
       howmany *= n[t];
 
 and manual computation of the desired serial transforms along the last
-dimensions.
+``rnk_n-rnk_pm-1`` dimensions.
diff --git a/doc/intro.rst b/doc/intro.rst
index 45100a1..9f03204 100644
--- a/doc/intro.rst
+++ b/doc/intro.rst
@@ -1,3 +1,7 @@
+.. Stray LaTeX macro residue (ifpackageloaded...) was removed from here.
+
+..
+
 Introduction
 ============
 
@@ -24,8 +28,8 @@ and the underlying algorithms of the PFFT library.
 The interface of PFFT is as close as possible to the FFTW-MPI interface.
 In fact, we consider every difference between PFFT and FFTW that is not
 explicitly mentioned within this manual as a bug that should be reported
-to . Therefore, porting code that uses FFTW-MPI to PFFT is almost
-trivial, e.g. see Section [sec:porting].
+to https://github.com/mpip/pfft.git. Therefore, porting code that uses
+FFTW-MPI to PFFT is almost trivial, e.g. see Section [sec:porting].
 
 Most features of PFFT are inherited from FFTW or similarily implemented.
 These include the following:
@@ -131,7 +135,8 @@ transposition routines.
 
 PFFT does not support GPU parallelization.
 
-You are welcome to propose new PFFT features at .
+You are welcome to propose new PFFT features at
+https://github.com/mpip/pfft.git.
 
 Alternative parallel FFT implementations
 ----------------------------------------
@@ -150,4 +155,4 @@ Parallel nonequispaced FFT
 If your are interested in a parallel implementation of nonequispaced
 fast Fourier transforms (NFFT) for distributed memory architectures, you
 should have a look at our PNFFT software library  that is also available
-at .
+at https://github.com/mpip/pnfft.git.
diff --git a/doc/preample.tex b/doc/preample.tex
new file mode 100644
index 0000000..a5d548b
--- /dev/null
+++ b/doc/preample.tex
@@ -0,0 +1,105 @@
+%% Preamble of the PFFT user manual: packages, hyperref setup, numbering and page style.
+\usepackage[english]{babel} % hyphenation (word breaks) for English
+\usepackage[utf8]{inputenc}
+\usepackage[T1]{fontenc}
+\usepackage[fixlanguage]{babelbib} % easily change bib language
+
+%% Vector fonts for PDF
+%% \usepackage{ae} % deprecated package, use lmodern instead
+\usepackage{lmodern} % standard LaTeX font
+
+%\usepackage{makeidx} % automatic index generation, required for nomencl.sty
+%\usepackage{nomencl} % important symbols in a table at the beginning of document
+
+%% AMSMath-packages
+\usepackage{amsmath}
+\usepackage{amsthm}
+\usepackage{amssymb}
+% \usepackage{amsrefs}
+% \usepackage{textcmds}
+\usepackage{exscale} % Correct font scaling in formulas
+
+% \usepackage{subfig}
+% \usepackage{graphicx} % Graphics for figures
+\usepackage[svgnames, table, hyperref]{xcolor} % color support (SVG color names, tables, hyperref)
+\usepackage{paralist} % compact itemize, enumerate, ...
+\usepackage{listings} % source code in LaTeX
+\usepackage{multirow} % combine multiple rows in arrays
+\usepackage{rotating}
+
+%% improvements of LaTeX environments
+\usepackage{scrhack} % avoid warning of scrreprt when loading float package
+% \usepackage{float}
+% \usepackage{verbatim}
+% \usepackage{array}
+
+% \usepackage{url} % provides \url command for bibtex
+\usepackage{hyperref} % provides \url command for bibtex and links to jump within documents
+\hypersetup{plainpages=false, colorlinks, linkcolor=black, citecolor=black, urlcolor=blue,
+pdftitle={PFFT User Manual},
+pdfauthor={Michael Pippig}, pdfstartview={FitBH}}
+
+%% adjust numbering: figures, tables and equations are numbered per chapter
+\numberwithin{figure}{chapter}
+\numberwithin{table}{chapter}
+\numberwithin{equation}{chapter}
+
+%% activate for algorithms
+\usepackage{algpseudocode}
+\usepackage[chapter]{algorithm}
+\usepackage{algorithmicx}
+% \floatname{algorithm}{Algorithmus} % use german title for algorithms
+% \numberwithin{algorithm}{chapter}
+
+\usepackage{xspace}
+\usepackage{nicefrac}
+
+\usepackage{todonotes}
+
+%% activate for compact page layout
+% \usepackage{geometry}
+% \geometry{top=30.4mm, left=30.4mm, text={155mm,240mm}, headheight=10mm, headsep=5mm, includemp, marginparwidth=15.4mm}
+
+%% activate for headline with chapter information on every page
+\usepackage{scrpage2}
+\pagestyle{scrheadings}
+\automark{chapter}
+\clearscrheadings
+\lehead{\pagemark}
+\rehead{\leftmark}
+\rohead{\pagemark}
+\lohead{\rightmark}
+\ofoot[]{}
+\cfoot[]{}
+\ifoot[]{}
+
+%% mark footnotes with symbols instead of numbers
+\renewcommand*{\thefootnote}{\fnsymbol{footnote}}
+
+%% the quotchap document style redefines the \chapter and \chapter* commands to
+%% create fancy chapter head pages with huge chapter numbers (possibly greyed) and
+%% provides commands for adding quotations in the upper left corner of these pages.
+% \usepackage[grey]{quotchap}
+
+
+%% very special purpose packages
+% \usepackage{faktor} % provides a symbol for factor groups
+% \usepackage{slashbox} % diagonally divide an array field
+
+
+%% experimental
+% \usepackage[color]{showkeys} % show all reference keys
+% \definecolor{refkey}{gray}{.75}
+% \definecolor{labelkey}{gray}{0.75}
+% \usepackage{epstopdf} % include .eps files with pdflatex
+% \usepackage{marginnote}
+%\usepackage{pgf}
+%\usepackage{jkpgf}
+%\usepackage{pstricks}
+%% make pdf-indexfile for inverse search - needs compatible pdf viewer
+% \synctex=1
+% \usepackage{pdfsync} % deprecated package, use synctex instead
+%% custom hyphenation points for words used throughout the manual
+\hyphenation{equi-spaced non-equi-spaced}
+
+\input{shortcuts.tex} % Shortcuts for math symbols
diff --git a/doc/reference.rst b/doc/reference.rst
index 3e9537d..0933d542 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -1,3 +1,7 @@
+.. Stray LaTeX macro residue (ifpackageloaded...) was removed from here.
+
+..
+
 PFFT Reference
 ==============
 
@@ -11,22 +15,25 @@ You must include the PFFT header file by
     #include <pfft.h>
 
 in the preamble of each source file that calls PFFT. This header
-automatically includes and . Therefore, PFFT can use the data type
-defined in , see . Note that is defined to be the C99 native complex
-whenever is included *before* , and . Otherwise it is defined as
+automatically includes ``fftw3.h`` and ``fftw3-mpi.h``. Therefore, PFFT
+can use the ``fftw_complex`` data type defined in ``fftw3.h``, see the
+FFTW manual. Note that ``fftw_complex`` is defined to be the C99 native
+complex whenever ``<complex.h>`` is included *before* ``<fftw3.h>``,
+``<fftw3-mpi.h>`` and ``<pfft.h>``. Otherwise it is defined as
 
 ::
 
     typedef double fftw_complex[2];
 
-For the sake of a clean namespace we define the wrapper data type as
+For the sake of a clean namespace we define the wrapper data type
+``pfft_complex`` as
 
 ::
 
     typedef fftw_complex pfft_complex;
 
-that can be used equivallently to . Futhermore, we define the wrapper
-functions
+that can be used equivalently to ``fftw_complex``. Furthermore, we
+define the wrapper functions
 
 ::
 
@@ -36,16 +43,18 @@ functions
     void pfft_free(void *p);
 
 as substitues for their corresponding FFTW equivalents, see . Note that
-memory allocated by one of these functions must be freed with (or its
-equivalent ). Because of the performance reasons given in  we recommend
-to use one of the (or its equivalent ) allocation functions for all
-arrays containing FFT inputs and outputs. However, PFFT will also work
+memory allocated by one of these functions must be freed with
+``pfft_free`` (or its equivalent ``fftw_free``). Because of the
+performance reasons given in  we recommend to use one of the ``pfft_``
+(or its equivalent ``fftw_``) allocation functions for all arrays
+containing FFT inputs and outputs. However, PFFT will also work
 (possibly slower) with any other memory allocation method.
 
-Different precisions are handled as in FFTW: That is functions and
-datatypes become (single precision) or (long double precision) prefixed.
-Quadruple precision is not yet supported. The main problem is that we do
-not know about a suitable MPI datatype to represent .
+Different precisions are handled as in FFTW: That is ``pfft_`` functions
+and datatypes become ``pfftf_`` (single precision) or ``pfftl_`` (long
+double precision) prefixed. Quadruple precision is not yet supported.
+The main problem is that we do not know about a suitable MPI datatype to
+represent ``__float128``.
 
 MPI Initialization
 ------------------
@@ -59,7 +68,8 @@ wrapper functions
     void pfft_init(void);
     void pfft_cleanup(void);
 
-that can be used as substitutes for and , respectively.
+that can be used as substitutes for ``fftw_mpi_init`` and
+``fftw_mpi_cleanup``, respectively.
 
 Using PFFT Plans
 ----------------
@@ -77,7 +87,8 @@ and freed with
 
     void pfft_destroy_plan(const pfft_plan plan);
 
-Note, that you can *not* apply or on PFFT plans.
+Note, that you can *not* apply ``fftw_mpi_execute`` or ``fftw_destroy``
+on PFFT plans.
 
 The new array execute functions are given by
 
@@ -88,9 +99,9 @@ The new array execute functions are given by
     void pfft_execute_dft_c2r(const pfft_plan plan, pfft_complex *in, double *out);
     void pfft_execute_r2r(const pfft_plan plan, double *in, double *out);
 
-The arrays given by and must have the correct size and the same
-alignement as the array that were used to create the plan, just as it is
-the case for FFTW, see [fftw-new-array].
+The arrays given by ``in`` and ``out`` must have the correct size and
+the same alignment as the arrays that were used to create the plan, just
+as it is the case for FFTW, see [fftw-new-array].
 
 Data Distribution Functions
 ---------------------------
@@ -123,64 +134,75 @@ parallel transform.
 
 Arguments:
 
-is the rank of the transform (typically the size of the arrays , , )
-that can be any integer :math:`\ge 2`. The planner corresponds to a of
-3.
-
-The array of size specifies the transform dimensions. They can be any
-positive integer.
-
-The array of size specifies the input array dimensions. They can be any
-positive integer with for all dimensions . For the inputs will be padded
-with zeros up to size along the -th dimension before the transform, see
-Section [sec:pruned-fft].
-
-The array of size specifies the output array dimensions. They can be any
-positive integer with for all dimensions . For the outputs will be
-pruned to size along the -th dimension after the transform, see
-Section [sec:pruned-fft].
-
-is the number of transforms to compute. The resulting plan computes
-howmany transforms, where the input of the k-th transform is at location
-in+k (in C pointer arithmetic) with stride , and its output is at
-location out+k with stride . The basic interface corresponds to
-howmany=1.
-
-is a Cartesian communicator of dimension that specifies the parallel
-data decomposition, see Section [sec:data-decomp]. Most of the time,
-PFFT requires . The only exception is the case , see
-Section [sec:3don3d]. If an ordinary (i.e. non-Cartesian) communicator
-is passed, PFFT internally converts it into a one-dimensional Cartesian
-communicator while retaining the MPI ranks (this results in the FFTW-MPI
-data decomposition).
-
-The arrays and of size specify the block sizes for the first dimensions
-of the input and output data, respectively. These must be the same block
-sizes as were passed to the corresponding function. You can pass to use
-PFFT’s default block sizes. Furthermore, you can use to set the default
-block size in separate dimensions, e.g., .
-
-is a bitwise OR (’’) of zero or more planner flags, as defined in
-Section [sec:flags].
-
-The array of size returns the size of the local input array block in
-every dimension (counted in units of complex numbers).
-
-The array of size returns the offset of the local input array block in
-every dimension (counted in units of complex numbers).
-
-The array of size returns the size of the local output array block in
-every dimension (counted in units of complex numbers).
-
-The array of size returns the offset of the local output array block in
-every dimension (counted in units of complex numbers).
-
-In addition, the following functions compute the local data distribution
-of the process with MPI rank . The interface can be understood as a call
-of where is given by , i.e., each MPI process computes its own data
-block. However, functions have a return type, i.e., they omit the
-computation of the local array size that is necessary to hold the
-parallel transform. This makes functions substantially faster in
+``rnk_n`` is the rank of the transform (typically the size of the arrays
+``n``, ``ni``, ``no``) that can be any integer :math:`\ge 2`. The
+``_3d`` planner corresponds to a ``rnk_n`` of 3.
+
+The array ``n`` of size ``rnk_n`` specifies the transform dimensions.
+They can be any positive integer.
+
+The array ``ni`` of size ``rnk_n`` specifies the input array dimensions.
+They can be any positive integer with ``ni[t] <= n[t]`` for all
+dimensions ``t=0,...,rnk_n-1``. For ``ni[t]<n[t]`` the inputs will be
+padded with zeros up to size ``n[t]`` along the ``t``-th dimension
+before the transform, see Section [sec:pruned-fft].
+
+The array ``no`` of size ``rnk_n`` specifies the output array
+dimensions. They can be any positive integer with ``no[t] <= n[t]`` for
+all dimensions ``t=0,...,rnk_n-1``. For ``no[t]<n[t]`` the outputs will
+be pruned to size ``no[t]`` along the ``t``-th dimension after the
+transform, see Section [sec:pruned-fft].
+
+``howmany`` is the number of transforms to compute. The resulting plan
+computes howmany transforms, where the input of the k-th transform is at
+location in+k (in C pointer arithmetic) with stride ``howmany``, and its
+output is at location out+k with stride ``howmany``. The basic
+``pfft_plan_dft`` interface corresponds to howmany=1.
+
+``comm_cart`` is a Cartesian communicator of dimension ``rnk_pm`` that
+specifies the parallel data decomposition, see
+Section [sec:data-decomp]. Most of the time, PFFT requires
+``rnk_pm < rnk_n``. The only exception is the case
+``rnk_pm == rnk_n == 3``, see Section [sec:3don3d]. If an ordinary (i.e.
+non-Cartesian) communicator is passed, PFFT internally converts it into
+a one-dimensional Cartesian communicator while retaining the MPI ranks
+(this results in the FFTW-MPI data decomposition).
+
+The arrays ``iblock`` and ``oblock`` of size ``rnk_pm+1`` specify the
+block sizes for the first ``rnk_pm+1`` dimensions of the input and
+output data, respectively. These must be the same block sizes as were
+passed to the corresponding ``local_size`` function. You can pass
+``PFFT_DEFAULT_BLOCKS`` to use PFFT’s default block sizes. Furthermore,
+you can use ``PFFT_DEFAULT_BLOCK`` to set the default block size in
+separate dimensions, e.g., ``iblock[t]=PFFT_DEFAULT_BLOCK``.
+
+``pfft_flags`` is a bitwise OR (’``|``\ ’) of zero or more planner
+flags, as defined in Section [sec:flags].
+
+The array ``local_ni`` of size ``rnk_n`` returns the size of the local
+input array block in every dimension (counted in units of complex
+numbers).
+
+The array ``local_i_start`` of size ``rnk_n`` returns the offset of the
+local input array block in every dimension (counted in units of complex
+numbers).
+
+The array ``local_no`` of size ``rnk_n`` returns the size of the local
+output array block in every dimension (counted in units of complex
+numbers).
+
+The array ``local_o_start`` of size ``rnk_n`` returns the offset of the
+local output array block in every dimension (counted in units of complex
+numbers).
+
+In addition, the following ``local_block`` functions compute the local
+data distribution of the process with MPI rank ``pid``. The
+``local_size`` interface can be understood as a call of ``local_block``
+where ``pid`` is given by ``MPI_Comm_rank(comm_cart, &pid)``, i.e., each
+MPI process computes its own data block. However, ``local_block``
+functions have a ``void`` return type, i.e., they omit the computation
+of the local array size that is necessary to hold the parallel
+transform. This makes ``local_block`` functions substantially faster in
 exectuion.
 
 ::
@@ -230,19 +252,20 @@ parallel transform.
 Arguments are the same as for c2c transforms (see
 Section [sec:local-size-c2c]) with the following exceptions:
 
-The logical input array size will differ from the physical array size of
-the real inputs if the flag is included in . This results from the
-padding at the end of the last dimension that is necessary to align the
-real valued inputs and complex valued outputs for inplace transforms,
-see . In contrast to FFTW-MPI, PFFT does not pad the r2c inputs per
-default.
+The logical input array size ``ni`` will differ from the physical array
+size of the real inputs if the flag ``PFFT_PADDED_R2C`` is included in
+``pfft_flags``. This results from the padding at the end of the last
+dimension that is necessary to align the real valued inputs and complex
+valued outputs for inplace transforms, see . In contrast to FFTW-MPI,
+PFFT does not pad the r2c inputs per default.
 
-is counted in units of real numbers. It will include padding
+``local_ni`` is counted in units of real numbers. It will include
+padding.
 
-is counted in units of real numbers.
+``local_i_start`` is counted in units of real numbers.
 
-The corresponding functions compute the local data distribution of the
-process with MPI rank .
+The corresponding ``local_block`` functions compute the local data
+distribution of the process with MPI rank ``pid``.
 
 ::
 
@@ -291,19 +314,19 @@ parallel transform.
 Arguments are the same as for c2c transforms (see
 Section [sec:local-size-c2c]) with the following exceptions:
 
-The logical output array size will differ from the physical array size
-of the real outputs if the flag is included in . This results from the
-padding at the end of the last dimension that is necessary to align the
-real valued outputs and complex valued inputs for inplace transforms,
-see . In contrast to FFTW-MPI, PFFT does not pad the c2r outputs per
-default.
+The logical output array size ``no`` will differ from the physical array
+size of the real outputs if the flag ``PFFT_PADDED_C2R`` is included in
+``pfft_flags``. This results from the padding at the end of the last
+dimension that is necessary to align the real valued outputs and complex
+valued inputs for inplace transforms, see . In contrast to FFTW-MPI,
+PFFT does not pad the c2r outputs per default.
 
-is counted in units of real numbers.
+``local_no`` is counted in units of real numbers.
 
-is counted in units of real numbers.
+``local_o_start`` is counted in units of real numbers.
 
-The corresponding functions compute the local data distribution of the
-process with MPI rank .
+The corresponding ``local_block`` functions compute the local data
+distribution of the process with MPI rank ``pid``.
 
 ::
 
@@ -352,16 +375,16 @@ transform.
 Arguments are the same as for c2c transforms (see
 Section [sec:local-size-c2c]) with the following exceptions:
 
-is counted in units of real numbers.
+``local_ni`` is counted in units of real numbers.
 
-is counted in units of real numbers.
+``local_i_start`` is counted in units of real numbers.
 
-is counted in units of real numbers.
+``local_no`` is counted in units of real numbers.
 
-is counted in units of real numbers.
+``local_o_start`` is counted in units of real numbers.
 
-The corresponding functions compute the local data distribution of the
-process with MPI rank .
+The corresponding ``local_block`` functions compute the local data
+distribution of the process with MPI rank ``pid``.
 
 ::
 
@@ -407,33 +430,38 @@ Complex-to-Complex FFT
         int sign, unsigned pfft_flags);
 
 Plan a parallel, complex input/output discrete Fourier transform (DFT)
-in two or more dimensions, returning an . The planner returns NULL if
-the plan cannot be created.
+in two or more dimensions, returning a ``pfft_plan``. The planner
+returns NULL if the plan cannot be created.
 
 Arguments:
 
-, , , , , , , must be the same as passed to the corresponding function,
-see Section [sec:local-size-c2c].
+``rnk_n``, ``n``, ``ni``, ``no``, ``howmany``, ``iblock``, ``oblock``,
+``comm_cart`` must be the same as passed to the corresponding
+``pfft_local_size_dft`` function, see Section [sec:local-size-c2c].
 
-The array of size specifies the serial transforms that will be omitted.
-For set if the -th serial transformation should be computed, otherwise
-set , see Section [sec:skip-trafo] for more details.
+The array ``skip_trafos`` of size ``rnk_pm+1`` specifies the serial
+transforms that will be omitted. For ``t=0,...,rnk_pm`` set
+``skip_trafos[t]=1`` if the ``t``-th serial transformation should be
+skipped, otherwise set ``skip_trafos[t]=0``, see
+Section [sec:skip-trafo] for more details.
 
-and point to the complex valued input and output arrays of the
-transform, which may be the same (yielding an in-place transform). These
-arrays are overwritten during planning, unless is used in the flags.
-(The arrays need not be initialized, but they must be allocated.)
+``in`` and ``out`` point to the complex valued input and output arrays
+of the transform, which may be the same (yielding an in-place
+transform). These arrays are overwritten during planning, unless
+``PFFT_ESTIMATE | PFFT_NO_TUNE`` is used in the flags. (The arrays need
+not be initialized, but they must be allocated.)
 
-is the sign of the exponent in the formula that defines the Fourier
-transform. It can be -1 (= ) or +1 (= ).
+``sign`` is the sign of the exponent in the formula that defines the
+Fourier transform. It can be -1 (= ``PFFT_FORWARD``) or +1 (=
+``PFFT_BACKWARD``).
 
-is a bitwise OR (’’) of zero or more planner flags, as defined in
-Section [sec:flags].
+``pfft_flags`` is a bitwise OR (``|``) of zero or more planner
+flags, as defined in Section [sec:flags].
 
 PFFT computes an unnormalized transform: computing a forward followed by
 a backward transform (or vice versa) will result in the original data
 multiplied by the size of the transform (the product of the dimensions
-).
+``n[t]``).
 
 Real-to-Complex FFT
 ~~~~~~~~~~~~~~~~~~~
@@ -458,25 +486,27 @@ Real-to-Complex FFT
         int sign, unsigned pfft_flags);
 
 Plan a parallel, real-input/complex-output discrete Fourier transform
-(DFT) in two or more dimensions, returning an . The planner returns NULL
-if the plan cannot be created.
+(DFT) in two or more dimensions, returning a ``pfft_plan``. The planner
+returns NULL if the plan cannot be created.
 
 Arguments:
 
-, , , , , , , must be the same as passed to the corresponding function,
-see Section [sec:local-size-r2c].
+``rnk_n``, ``n``, ``ni``, ``no``, ``howmany``, ``iblock``, ``oblock``,
+``comm_cart`` must be the same as passed to the corresponding
+``pfft_local_size_dft_r2c`` function, see Section [sec:local-size-r2c].
 
-and point to the real valued input and complex valued output arrays of
-the transform, which may be the same (yielding an in-place transform).
-These arrays are overwritten during planning, unless is used in the
-flags. (The arrays need not be initialized, but they must be allocated.)
+``in`` and ``out`` point to the real valued input and complex valued
+output arrays of the transform, which may be the same (yielding an
+in-place transform). These arrays are overwritten during planning,
+unless ``PFFT_ESTIMATE | PFFT_NO_TUNE`` is used in the flags. (The
+arrays need not be initialized, but they must be allocated.)
 
-is the sign of the exponent in the formula that defines the Fourier
-transform. It can be -1 (= ) or +1 (= ). Note that this parameter is not
-part of the FFTW-MPI interface, where r2c transforms are defined to be
-forward transforms. However, the backward transform can be easily
-realized by an additional conjugation of the complex outputs as done by
-PFFT.
+``sign`` is the sign of the exponent in the formula that defines the
+Fourier transform. It can be -1 (= ``PFFT_FORWARD``) or +1 (=
+``PFFT_BACKWARD``). Note that this parameter is not part of the FFTW-MPI
+interface, where r2c transforms are defined to be forward transforms.
+However, the backward transform can be easily realized by an additional
+conjugation of the complex outputs as done by PFFT.
 
 Complex-to-Real FFT
 ~~~~~~~~~~~~~~~~~~~
@@ -501,25 +531,27 @@ Complex-to-Real FFT
         int sign, unsigned pfft_flags);
 
 Plan a parallel, complex-input/real-output discrete Fourier transform
-(DFT) in two or more dimensions, returning an . The planner returns NULL
-if the plan cannot be created.
+(DFT) in two or more dimensions, returning a ``pfft_plan``. The planner
+returns NULL if the plan cannot be created.
 
 Arguments:
 
-, , , , , , , must be the same as passed to the corresponding function,
-see Section [sec:local-size-c2r].
+``rnk_n``, ``n``, ``ni``, ``no``, ``howmany``, ``iblock``, ``oblock``,
+``comm_cart`` must be the same as passed to the corresponding
+``pfft_local_size_dft_c2r`` function, see Section [sec:local-size-c2r].
 
-and point to the complex valued input and real valued output arrays of
-the transform, which may be the same (yielding an in-place transform).
-These arrays are overwritten during planning, unless is used in the
-flags. (The arrays need not be initialized, but they must be allocated.)
+``in`` and ``out`` point to the complex valued input and real valued
+output arrays of the transform, which may be the same (yielding an
+in-place transform). These arrays are overwritten during planning,
+unless ``PFFT_ESTIMATE | PFFT_NO_TUNE`` is used in the flags. (The
+arrays need not be initialized, but they must be allocated.)
 
-is the sign of the exponent in the formula that defines the Fourier
-transform. It can be -1 (= ) or +1 (= ). Note that this parameter is not
-part of the FFTW-MPI interface, where c2r transforms are defined to be
-backward transforms. However, the forward transform can be easily
-realized by an additional conjugation of the complex inputs as done by
-PFFT.
+``sign`` is the sign of the exponent in the formula that defines the
+Fourier transform. It can be -1 (= ``PFFT_FORWARD``) or +1 (=
+``PFFT_BACKWARD``). Note that this parameter is not part of the FFTW-MPI
+interface, where c2r transforms are defined to be backward transforms.
+However, the forward transform can be easily realized by an additional
+conjugation of the complex inputs as done by PFFT.
 
 Real-to-Real FFT
 ~~~~~~~~~~~~~~~~
@@ -544,23 +576,26 @@ Real-to-Real FFT
         const pfft_r2r_kind *kinds, unsigned pfft_flags);
 
 Plan a parallel, real input/output (r2r) transform in two or more
-dimensions, returning an . The planner returns NULL if the plan cannot
-be created.
+dimensions, returning a ``pfft_plan``. The planner returns NULL if the
+plan cannot be created.
 
 Arguments:
 
-, , , , , , , must be the same as passed to the corresponding function,
-see Section [sec:local-size-r2r].
+``rnk_n``, ``n``, ``ni``, ``no``, ``howmany``, ``iblock``, ``oblock``,
+``comm_cart`` must be the same as passed to the corresponding
+``pfft_local_size_r2r`` function, see Section [sec:local-size-r2r].
 
-and point to the real valued input and output arrays of the transform,
-which may be the same (yielding an in-place transform). These arrays are
-overwritten during planning, unless is used in the flags. (The arrays
-need not be initialized, but they must be allocated.)
+``in`` and ``out`` point to the real valued input and output arrays of
+the transform, which may be the same (yielding an in-place transform).
+These arrays are overwritten during planning, unless
+``PFFT_ESTIMATE | PFFT_NO_TUNE`` is used in the flags. (The arrays need
+not be initialized, but they must be allocated.)
 
-The array of length specifies the kind of r2r transform that is computed
-in the corresponding dimensions. Just like FFTW-MPI we compute the
-separable product formed by taking each transform kind along the
-corresponding dimension, one dimension after another.
+The array ``kinds`` of length ``rnk_n`` specifies the kind of r2r
+transform that is computed in the corresponding dimensions. Just like
+FFTW-MPI we compute the separable product formed by taking each
+transform kind along the corresponding dimension, one dimension after
+another.
 
 FFT Execution Timer
 -------------------
@@ -572,8 +607,8 @@ Basis Run Time Measurements
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 PFFT-plans automatically accumulate the local run times of every call to
-. For most applications it is sufficient to print run time of a plan
-averaged over all runs with
+``pfft_execute``. For most applications it is sufficient to print the
+run time of a plan ``ths`` averaged over all runs with
 
 ::
 
@@ -581,16 +616,17 @@ averaged over all runs with
         const pfft_plan ths, MPI_Comm comm);
 
 Note, that for each timer the maximum time over all processes is reduced
-to rank of communicator , i.e., a call to is performed and the output is
-only printed on this process. The following function works in the same
-way but prints more verbose output
+to rank ``0`` of communicator ``comm``, i.e., a call to ``MPI_Reduce``
+is performed and the output is only printed on this process. The
+following function works in the same way but prints more verbose output
 
 ::
 
     void pfft_print_average_timer_adv(
         const pfft_plan ths, MPI_Comm comm);
 
-To write the averaged run time of plan into a file called use
+To write the averaged run time of plan ``ths`` into a file called
+``name`` use
 
 ::
 
@@ -599,7 +635,8 @@ To write the averaged run time of plan into a file called use
     void pfft_write_average_timer_adv(
         const pfft_plan ths, const char *name, MPI_Comm comm);
 
-Again, the output is only written on rank of communicator .
+Again, the output is only written on rank ``0`` of communicator
+``comm``.
 
 Discard all the recorded run times with
 
@@ -614,16 +651,17 @@ creation function.
 Advanced Timer Manipulation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-In order to access the run times directly a new typedef is introduced.
-The following function returns a copy of the timer corresponding to PFFT
-plan
+In order to access the run times directly a new typedef ``pfft_timer``
+is introduced. The following function returns a copy of the timer
+corresponding to PFFT plan ``ths``
 
 ::
 
     pfft_timer pfft_get_timer(
         const pfft_plan ths);
 
-Note that the memory of the returned must be released with
+Note that the memory of the returned ``pfft_timer`` must be released
+with
 
 ::
 
@@ -633,74 +671,75 @@ Note that the memory of the returned must be released with
 as soon as the timer is not needed anymore.
 
 In the following we introduce some routines to perform basic operations
-on timers. For all functions with a return value you must use in order
-to release the allocated memory of the timer. Create a copy of a
-PFFT-timer with
+on timers. For all functions with a ``pfft_timer`` return value you must
+use ``pfft_destroy_timer`` in order to release the allocated memory of
+the timer. Create a copy of a PFFT-timer ``orig`` with
 
 ::
 
     pfft_timer pfft_copy_timer(
         const pfft_timer orig);
 
-Compute the average, local time over all runs of a timer with
+Compute the average, local time over all runs of a timer ``ths`` with
 
 ::
 
     void pfft_average_timer(
         pfft_timer ths);
 
-Create a new timer that contains the sum of two timers and with
+Create a new timer that contains the sum of two timers ``sum1`` and
+``sum2`` with
 
 ::
 
     pfft_timer pfft_add_timers(
         const pfft_timer sum1, const pfft_timer sum2);
 
-Create a timer that contains the maximum times of all the timers from
-all processes belonging to communicator with
+Create a timer that contains the maximum times of all the timers ``ths``
+from all processes belonging to communicator ``comm`` with
 
 ::
 
     pfft_timer pfft_reduce_max_timer(
         const pfft_timer ths, MPI_Comm comm);
 
-Since this function calls , only the first process (rank 0) of will get
-the desired data while all the other processes have timers with
-undefined values.
+Since this function calls ``MPI_Reduce``, only the first process (rank
+0) of ``comm`` will get the desired data while all the other processes
+have timers with undefined values.
 
 Note, that you can not access the elements of a timer directly, since it
-is only a pointer to a . However, PFFT offers a routine that creates an
-array and copies all the entries of the timer into it
+is only a pointer to a ``struct``. However, PFFT offers a routine that
+creates an array and copies all the entries of the timer into it
 
 ::
 
     double* pfft_convert_timer2vec(
         const pfft_timer ths);
 
-Remember to use in order to release the allocated memory of the returned
-array at the moment it is not needed anymore. The entries of the
-returned array are ordered as follows:
+Remember to use ``free`` in order to release the allocated memory of the
+returned array as soon as it is not needed anymore. The entries of
+the returned array are ordered as follows:
 
-dimension of the process mesh
+dimension of the process mesh ``rnk_pm``
 
-number of serial trafos
+number of serial trafos ``rnk_trafo``
 
-number of global remaps
+number of global remaps ``rnk_remap``
 
-number of runs
+number of ``pfft_execute`` runs ``iter``
 
 local run time of all runs
 
-local times of the serial trafos
+``rnk_trafo`` local times of the serial trafos
 
-local times of the global remaps
+``rnk_remap`` local times of the global remaps
 
 2 times of the global remaps that are only necessary for
 three-dimensional FFTs on three-dimensional process meshes
 
-time for computing twiddled input (as needed for )
+time for computing twiddled input (as needed for ``PFFT_SHIFTED_OUT``)
 
-time for computing twiddled output (as needed for )
+time for computing twiddled output (as needed for ``PFFT_SHIFTED_IN``)
 
 The complementary function
 
@@ -709,8 +748,9 @@ The complementary function
     pfft_timer pfft_convert_vec2timer(
         const double *times);
 
-creates a timer and fills it’s entries with the data from array .
-Thereby, the entries of must be in the same order as above.
+creates a timer and fills its entries with the data from array
+``times``. Thereby, the entries of ``times`` must be in the same order
+as above.
 
 Ghost Cell Communication
 ------------------------
@@ -719,16 +759,18 @@ In the following we describe the PFFT ghost cell communication module.
 At the moment, PFFT ghost cell communication is restricted to
 three-dimensional arrays.
 
-Assume a three-dimensional array of size that is distributed in blocks
-such that each process has a local copy of with
+Assume a three-dimensional array ``data`` of size ``n[3]`` that is
+distributed in blocks such that each process has a local copy of
+``data[k[0],k[1],k[2]]`` with
 
 ::
 
     local_start[t] <= k[t] < local_start[t] + local_n[t]
 
-Here and in the following, we assume . The “classical” ghost cell
-exchange communicates all the necessary data between neighboring
-processes, such that each process gets a local copy of with
+Here and in the following, we assume ``t=0,1,2``. The “classical” ghost
+cell exchange communicates all the necessary data between neighboring
+processes, such that each process gets a local copy of
+``data[k[0],k[1],k[2]]`` with
 
 ::
 
@@ -741,31 +783,34 @@ where
     local_gc_start[t] = local_start[t] - gc_below[t];
     local_ngc[t] = local_n[t] + gc_below[t] + gc_above[t];
 
-I.e., the local array block is increased in every dimension by elements
-below and elements above. Hereby, the is wrapped periodically whenever
-exceeds the array dimensions. The number of ghost cells in every
-dimension can be chosen independently and can be arbitrary large, i.e.,
-PFFT ghost cell communication also handles the case where the requested
-data exceeds next neighbor communication. The number of ghost cells can
-even be bigger than the array size, which results in multiple local
-copies of the same data elements at every process. However, the arrays
-and must be equal among all MPI processes.
+I.e., the local array block is increased in every dimension by
+``gc_below`` elements below and ``gc_above`` elements above. Hereby, the
+``data`` is wrapped periodically whenever ``k[t]`` exceeds the array
+dimensions. The number of ghost cells in every dimension can be chosen
+independently and can be arbitrarily large, i.e., PFFT ghost cell
+communication also handles the case where the requested data exceeds
+next neighbor communication. The number of ghost cells can even be
+bigger than the array size, which results in multiple local copies of
+the same data elements at every process. However, the arrays
+``gc_below`` and ``gc_above`` must be equal among all MPI processes.
 
 PFFT ghost cell communication can work on both, the input and output
-array distributions. Substitute and by and if you are interested in
-ghost cell communication of the input array. For ghost cell
-communication of the output array, substitute and by and .
+array distributions. Substitute ``local_n`` and ``local_start`` by
+``local_ni`` and ``local_i_start`` if you are interested in ghost cell
+communication of the input array. For ghost cell communication of the
+output array, substitute ``local_n`` and ``local_start`` by ``local_no``
+and ``local_o_start``.
 
 Using Ghost Cell Plans
 ~~~~~~~~~~~~~~~~~~~~~~
 
-We introduce a new datatype that stores all the necessary information
-for ghost cell communication. Using a ghost cell plan follows the
-typical workflow: At first, determine the parallel data distribution;
-cf. Section [sec:gc:local-size]. Next, create a ghost cell plan; cf.
-Section [sec:gc:plan-cdata] and Section [sec:gc:plan-rdata]. Execute the
-ghost cell communication with one of the following two collective
-functions
+We introduce a new datatype ``pfft_gcplan`` that stores all the
+necessary information for ghost cell communication. Using a ghost cell
+plan follows the typical workflow: At first, determine the parallel data
+distribution; cf. Section [sec:gc:local-size]. Next, create a ghost cell
+plan; cf. Section [sec:gc:plan-cdata] and Section [sec:gc:plan-rdata].
+Execute the ghost cell communication with one of the following two
+collective functions
 
 ::
 
@@ -785,8 +830,8 @@ Finally, free the allocated memory with
     void pfft_destroy_gcplan(
         pfft_gcplan ths);
 
-if the plan is not needed anymore. Passing a freed plan to or results in
-undefined behavior.
+if the plan is not needed anymore. Passing a freed plan to
+``pfft_exchange`` or ``pfft_reduce`` results in undefined behavior.
 
 Data Distribution
 ~~~~~~~~~~~~~~~~~
@@ -813,25 +858,30 @@ distribution:
         const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
         ptrdiff_t *local_ngc, ptrdiff_t *local_gc_start);
 
-Hereby, and must be the exactly same variables that were used for the
-PFFT plan creation. However, only the case is completely implemented at
-the moment. The local array size must be equal to or (computed by an
-appropriate call of ; cf. Section [sec:local-size]) depending on whether
-the ghost cell plan works on the FFT input or output array. Analogously,
-becomes or . The number of ghost cells is given by the two arrays and
-that must be equal among all MPI processes. All the ghost cell data
-distribution functions return the local array plus ghost cell size and
-the corresponding offset as two arrays of length . In addition, the
-return value gives the number of data elements that are necessary in
-order to store the array plus ghost cells.
+Hereby, ``rnk_n`` and ``howmany`` must be exactly the same variables
+that were used for the PFFT plan creation. However, only the case
+``rnk_n==3`` is completely implemented at the moment. The local array
+size ``local_n`` must be equal to ``local_ni`` or ``local_no`` (computed
+by an appropriate call of ``pfft_local_size``; cf.
+Section [sec:local-size]) depending on whether the ghost cell plan works
+on the FFT input or output array. Analogously, ``local_start`` becomes
+``local_i_start`` or ``local_o_start``. The number of ghost cells is
+given by the two arrays ``gc_below`` and ``gc_above`` that must be equal
+among all MPI processes. All the ghost cell data distribution functions
+return the local array plus ghost cell size ``local_ngc`` and the
+corresponding offset ``local_gc_start`` as two arrays of length
+``rnk_n``. In addition, the ``ptrdiff_t`` return value gives the number
+of data elements that are necessary in order to store the array plus
+ghost cells.
 
 Note, that the array distribution functions do not distinguish between
-real and complex valued data. That is because and count array elements
-in units of complex or real depending on the transform. In addition, it
-does not matter if the local array is transposed or not, i.e., it is not
-necessary to pass the flags and to the ghost cell distribution function.
-In constrast, the ghost cell plan creation depends on the transform type
-as well as the transposition flags.
+real and complex valued data. That is because ``local_n`` and
+``local_start`` count array elements in units of complex or real
+depending on the transform. In addition, it does not matter if the local
+array is transposed or not, i.e., it is not necessary to pass the flags
+``PFFT_TRANSPOSED_IN`` and ``PFFT_TRANSPOSED_OUT`` to the ghost cell
+distribution function. In contrast, the ghost cell plan creation
+depends on the transform type as well as the transposition flags.
 
 Memory Allocation
 ~~~~~~~~~~~~~~~~~
@@ -857,11 +907,12 @@ allocation of memory in for complex valued and real valued arrays.
     /* Allocate enough memory for FFT and ghost cells */
     pfft_complex *cdata = pfft_alloc_complex(alloc_local_gc > alloc_local ? alloc_local_gc : alloc_local);
 
-Here, gives the number of data elements that are necessary to hold all
-steps of the parallel FFT, while gives the number of data elements that
-are necessary to hold all steps of the ghost cell communication. Note
-that we took the maximum of these both numbers as argument for . The
-code snippet for real valued arrays looks very similar.
+Here, ``alloc_local`` gives the number of data elements that are
+necessary to hold all steps of the parallel FFT, while
+``alloc_local_gc`` gives the number of data elements that are necessary
+to hold all steps of the ghost cell communication. Note that we took the
+maximum of these two numbers as argument for ``pfft_alloc_complex``.
+The code snippet for real valued arrays looks very similar.
 
 ::
 
@@ -880,7 +931,8 @@ code snippet for real valued arrays looks very similar.
     double *rdata = pfft_alloc_real(alloc_local_gc > 2*alloc_local ? alloc_local_gc : 2*alloc_local);
 
 Note that the number of real valued data elements is given by two times
-for r2c transforms, whereas the last line would change into
+``alloc_local`` for r2c transforms, whereas the last line would change
+into
 
 ::
 
@@ -898,9 +950,9 @@ c2c inputs,
 
 c2c outputs,
 
-r2c outputs (use flag ), and
+r2c outputs (use flag ``PFFT_GC_C2R``), and
 
-c2r inputs (use flag ).
+c2r inputs (use flag ``PFFT_GC_R2C``).
 
 Corresponding to the three interface layers for FFT planning, there are
 the following three layers for creating a complex valued ghost cell
@@ -922,34 +974,41 @@ plan:
         const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
         pfft_complex *data, MPI_Comm comm_cart, unsigned gc_flags);
 
-Hereby, , , and must be the variables that were used for the PFFT plan
-creation. However, only the case is completely implemented at the
-moment. Remember that is the logical FFT size just as it is the case for
-FFT planning. The block size must be equal to or depending on whether
-the ghost cell plan works on the FFT input or output array. Analogously,
-becomes or . Set the number of ghost cells by and as described in
-Section [sec:gc]. The flags must be set appropriately to the flags that
-were passed to the FFT planner. Table [tab:map-cgcflags] shows the ghost
-cell planner flags that must be set in dependence on the listed FFT
-planner flags.
+Hereby, ``rnk_n``, ``n``, ``howmany`` and ``comm_cart`` must be the
+variables that were used for the PFFT plan creation. However, only the
+case ``rnk_n==3`` is completely implemented at the moment. Remember that
+``n`` is the logical FFT size just as it is the case for FFT planning.
+The block size ``block`` must be equal to ``iblock`` or ``oblock``
+depending on whether the ghost cell plan works on the FFT input or
+output array. Analogously, ``data`` becomes ``in`` or ``out``. Set the
+number of ghost cells by ``gc_below`` and ``gc_above`` as described in
+Section [sec:gc]. The flags ``gc_flags`` must be set appropriately to
+the flags that were passed to the FFT planner. Table [tab:map-cgcflags]
+shows the ghost cell planner flags that must be set in dependence on the
+listed FFT planner flags.
 
 [h]
 
-+------------+-------------------+
-| FFT flag   | ghost cell flag   |
-+============+===================+
-+------------+-------------------+
-+------------+-------------------+
-+------------+-------------------+
++----------------------------+-------------------------------+
+| FFT flag                   | ghost cell flag               |
++============================+===============================+
+| ``PFFT_TRANSPOSED_NONE``   | ``PFFT_GC_TRANSPOSED_NONE``   |
++----------------------------+-------------------------------+
+| ``PFFT_TRANSPOSED_IN``     | ``PFFT_GC_TRANSPOSED``        |
++----------------------------+-------------------------------+
+| ``PFFT_TRANSPOSED_OUT``    | ``PFFT_GC_TRANSPOSED``        |
++----------------------------+-------------------------------+
 
 [tab:map-cgcflags]
 
-In addition, we introduce the flag (and its equivalent ) to handle the
-complex array storage format of r2c and c2r transforms. In fact, these
-two flags imply an ordinary complex valued ghost cell communication on
-an array of size . Please note that we wrongly assume periodic boundary
-conditions in this case. Therefore, you should ignore the data elements
-with the last index behind .
+In addition, we introduce the flag ``PFFT_GC_R2C`` (and its equivalent
+``PFFT_GC_C2R``) to handle the complex array storage format of r2c and
+c2r transforms. In fact, these two flags imply an ordinary complex
+valued ghost cell communication on an array of size
+``n[0] x ... x n[rnk_n-2] x (n[rnk_n-1]/2+1)``. Please note that we
+wrongly assume periodic boundary conditions in this case. Therefore, you
+should ignore the data elements with the last index behind
+``n[rnk_n-1]/2``.
 
 Plan Creation for Real Data
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -984,34 +1043,43 @@ the following three layers for creating a real valued ghost cell plan:
         const ptrdiff_t *gc_below, const ptrdiff_t *gc_above,
         double *data, MPI_Comm comm_cart, unsigned gc_flags);
 
-Hereby, , , and must be the variables that were used for the PFFT plan
-creation. Remember that is the logical FFT size just as it is the case
-for FFT planning. The block size must be equal to or depending on
-whether the ghost cell plan works on the FFT input or output array.
-Analogously, becomes or . Set the number of ghost cells by and as
-described in Section [sec:gc:local-size]. The flags must be set
+Hereby, ``rnk_n``, ``n``, ``howmany`` and ``comm_cart`` must be the
+variables that were used for the PFFT plan creation. Remember that ``n``
+is the logical FFT size just as it is the case for FFT planning. The
+block size ``block`` must be equal to ``iblock`` or ``oblock`` depending
+on whether the ghost cell plan works on the FFT input or output array.
+Analogously, ``data`` becomes ``in`` or ``out``. Set the number of ghost
+cells by ``gc_below`` and ``gc_above`` as described in
+Section [sec:gc:local-size]. The flags ``gc_flags`` must be set
 appropriately to the flags that were passed to the FFT planner.
 Table [tab:map-rgcflags] shows the ghost cell planner flags that must be
 set in dependence on the listed FFT planner flags.
 
 [h]
 
-+------------+-------------------+
-| FFT flag   | ghost cell flag   |
-+============+===================+
-+------------+-------------------+
-+------------+-------------------+
-+------------+-------------------+
-+------------+-------------------+
-+------------+-------------------+
++----------------------------+-------------------------------+
+| FFT flag                   | ghost cell flag               |
++============================+===============================+
+| ``PFFT_TRANSPOSED_NONE``   | ``PFFT_GC_TRANSPOSED_NONE``   |
++----------------------------+-------------------------------+
+| ``PFFT_TRANSPOSED_IN``     | ``PFFT_GC_TRANSPOSED``        |
++----------------------------+-------------------------------+
+| ``PFFT_TRANSPOSED_OUT``    | ``PFFT_GC_TRANSPOSED``        |
++----------------------------+-------------------------------+
+| ``PFFT_PADDED_R2C``        | ``PFFT_GC_PADDED_R2C``        |
++----------------------------+-------------------------------+
+| ``PFFT_PADDED_C2R``        | ``PFFT_GC_PADDED_C2R``        |
++----------------------------+-------------------------------+
 
 [tab:map-rgcflags]
 
-Note that the flag (or its equivalent ) implies an ordinary real valued
-ghost cell communication on an array of size . Especially, the padding
-elements will be handles as normal data points, i.e., you must we aware
-that the numbers of ghost cells and include the number of padding
-elements.
+Note that the flag ``PFFT_GC_PADDED_R2C`` (or its equivalent
+``PFFT_GC_PADDED_C2R``) implies an ordinary real valued ghost cell
+communication on an array of size
+``n[0] x ... x n[rnk_n-2] x 2*(n[rnk_n-1]/2+1)``. Especially, the
+padding elements will be handled as normal data points, i.e., you must
+be aware that the numbers of ghost cells ``gc_below[rnk_n-1]`` and
+``gc_above[rnk_n-1]`` include the number of padding elements.
 
 Inofficial Flags
 ~~~~~~~~~~~~~~~~
@@ -1020,8 +1088,9 @@ Ghost Cell Execution Timer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 PFFT ghost cell plans automatically accumulate the local run times of
-every call to and . For most applications it is sufficient to print run
-time of a plan averaged over all runs with
+every call to ``pfft_exchange`` and ``pfft_reduce``. For most
+applications it is sufficient to print the run time of a plan ``ths``
+averaged over all runs with
 
 ::
 
@@ -1029,17 +1098,17 @@ time of a plan averaged over all runs with
         const pfft_gcplan ths, MPI_Comm comm);
 
 Note, that for each timer the maximum time over all processes is reduced
-to rank of communicator , i.e., a call to is performed and the output is
-only printed on this process. The following function works in the same
-way but prints more verbose output
+to rank ``0`` of communicator ``comm``, i.e., a call to ``MPI_Reduce``
+is performed and the output is only printed on this process. The
+following function works in the same way but prints more verbose output
 
 ::
 
     void pfft_print_average_gctimer_adv(
         const pfft_gcplan ths, MPI_Comm comm);
 
-To write the averaged run time of a ghost cell plan into a file called
-use
+To write the averaged run time of a ghost cell plan ``ths`` into a file
+called ``name`` use
 
 ::
 
@@ -1048,7 +1117,8 @@ use
     void pfft_write_average_gctimer_adv(
         const pfft_gcplan ths, const char *name, MPI_Comm comm);
 
-Again, the output is only written on rank of communicator .
+Again, the output is only written on rank ``0`` of communicator
+``comm``.
 
 Discard all the recorded run times with
 
@@ -1060,10 +1130,10 @@ Discard all the recorded run times with
 This function is called per default at the end of every ghost cell plan
 creation function.
 
-In order to access the run times directly a new typedef is introduced.
-The following functions return a copy of the timer corresponding to
-ghost cell plan that accumulated the time for ghost cell exchange or
-ghost cell reduce, respectively:
+In order to access the run times directly a new typedef ``pfft_gctimer``
+is introduced. The following functions return a copy of the timer
+corresponding to ghost cell plan ``ths`` that accumulated the time for
+ghost cell exchange or ghost cell reduce, respectively:
 
 ::
 
@@ -1072,7 +1142,8 @@ ghost cell reduce, respectively:
     pfft_gctimer pfft_get_gctimer_red(
         const pfft_gcplan ths);
 
-Note that the memory of the returned must be released with
+Note that the memory of the returned ``pfft_gctimer`` must be released
+with
 
 ::
 
@@ -1082,55 +1153,56 @@ Note that the memory of the returned must be released with
 as soon as the timer is not needed anymore.
 
 In the following we introduce some routines to perform basic operations
-on timers. For all functions with a return value you must use in order
-to release the allocated memory of the timer. Create a copy of a ghost
-cell timer with
+on timers. For all functions with a ``pfft_gctimer`` return value you
+must use ``pfft_destroy_gctimer`` in order to release the allocated
+memory of the timer. Create a copy of a ghost cell timer ``orig`` with
 
 ::
 
     pfft_gctimer pfft_copy_gctimer(
         const pfft_gctimer orig);
 
-Compute the average, local time over all runs of a timer with
+Compute the average, local time over all runs of a timer ``ths`` with
 
 ::
 
     void pfft_average_gctimer(
         pfft_gctimer ths);
 
-Create a new timer that contains the sum of two timers and with
+Create a new timer that contains the sum of two timers ``sum1`` and
+``sum2`` with
 
 ::
 
     pfft_gctimer pfft_add_gctimers(
         const pfft_gctimer sum1, const pfft_gctimer sum2);
 
-Create a timer that contains the maximum times of all the timers from
-all processes belonging to communicator with
+Create a timer that contains the maximum times of all the timers ``ths``
+from all processes belonging to communicator ``comm`` with
 
 ::
 
     pfft_gctimer pfft_reduce_max_gctimer(
         const pfft_gctimer ths, MPI_Comm comm);
 
-Since this function calls , only the first process (rank 0) of will get
-the desired data while all the other processes have timers with
-undefined values.
+Since this function calls ``MPI_Reduce``, only the first process (rank
+0) of ``comm`` will get the desired data while all the other processes
+have timers with undefined values.
 
 Note, that you can not access the elements of a timer directly, since it
-is only a pointer to a . However, PFFT offers a routine that creates an
-array and copies all the entries of the timer into it
+is only a pointer to a ``struct``. However, PFFT offers a routine that
+creates an array and copies all the entries of the timer into it
 
 ::
 
     void pfft_convert_gctimer2vec(
         const pfft_gctimer ths, double *times);
 
-Remember to use in order to release the allocated memory of the returned
-array at the moment it is not needed anymore. The entries of the
-returned array are ordered as follows:
+Remember to use ``free`` in order to release the allocated memory of the
+returned array at the moment it is not needed anymore. The entries of
+the returned array are ordered as follows:
 
-number of runs
+number of ``pfft_execute`` runs ``iter``
 
 local run time of all runs
 
@@ -1147,8 +1219,9 @@ The complementary function
     pfft_gctimer pfft_convert_vec2gctimer(
         const double *times);
 
-creates a timer and fills it’s entries with the data from array .
-Thereby, the entries of must be in the same order as above.
+creates a timer and fills its entries with the data from array
+``times``. Thereby, the entries of ``times`` must be in the same order
+as above.
 
 Useful Tools
 ------------
@@ -1159,8 +1232,8 @@ to perform parallel FFTs.
 Initializing Complex Inputs and Checking Outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-To fill a complex array with reproducible, complex values you can use
-one of the functions
+To fill a complex array ``data`` with reproducible, complex values you
+can use one of the functions
 
 ::
 
@@ -1173,8 +1246,9 @@ one of the functions
         const ptrdiff_t *local_n, const ptrdiff_t *local_start,
         pfft_complex *data);
 
-Hereby, the arrays , and of length ( for ) give the size of the FFT, the
-local array size and the local array offset as computed by the array
+Hereby, the arrays ``n``, ``local_n`` and ``local_start`` of length
+``rnk_n`` (``rnk_n==3`` for ``_3d``) give the size of the FFT, the local
+array size and the local array offset as computed by the array
 distribution functions described in Section [sec:local-size] The
 functions
 
@@ -1189,9 +1263,10 @@ functions
         const ptrdiff_t *local_n, const ptrdiff_t *local_start,
         const pfft_complex *data, MPI_Comm comm);
 
-compute the :math:`l_1`-norm between the elements of array and values
-produced by , . In addition, we supply the following functions for
-setting all the input data to zero at once
+compute the :math:`l_1`-norm between the elements of array ``data`` and
+values produced by ``pfft_init_input_complex_3d``,
+``pfft_init_input_complex``. In addition, we supply the following
+functions for setting all the input data to zero at once
 
 ::
 
@@ -1241,8 +1316,8 @@ backward FFT instead of being a buggy relict of the forward transform.
 Initializing Real Inputs and Checking Outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-To fill a real array with reproducible, real values use one of the
-functions
+To fill a real array ``data`` with reproducible, real values use one of
+the functions
 
 ::
 
@@ -1255,9 +1330,10 @@ functions
         const ptrdiff_t *local_n, const ptrdiff_t *local_start,
         double *data);
 
-Hereby, the arrays , and give the size of the FFT, the local array size
-and the local array offset as computed by the array distribution
-functions described in Section [sec:local-size] The functions
+Hereby, the arrays ``n``, ``local_n`` and ``local_start`` give the
+size of the FFT, the local array size and the local array offset as
+computed by the array distribution functions described in
+Section [sec:local-size]. The functions
 
 ::
 
@@ -1270,9 +1346,10 @@ functions described in Section [sec:local-size] The functions
         const ptrdiff_t *local_n, const ptrdiff_t *local_start,
         const pfft_complex *data, MPI_Comm comm);
 
-compute the :math:`l_1`-norm between the elements of array and values
-produced by , . In addition, we supply the following functions for
-setting all the input data to zero at once
+compute the :math:`l_1`-norm between the elements of array ``data`` and
+values produced by ``pfft_init_input_real_3d``,
+``pfft_init_input_real``. In addition, we supply the following functions
+for setting all the input data to zero at once
 
 ::
 
@@ -1285,11 +1362,12 @@ setting all the input data to zero at once
         const ptrdiff_t *local_n, const ptrdiff_t *local_start,
         double *data);
 
-Note, that both functions will set all array elements to zero were . In
-addition, both function will ignore all the errors resulting from these
-elements. Therefore, it is safe to use all these functions for a
-consistency check of a r2c transform followed by a c2r transform since
-all padding elements will be ignored.
+Note, that both ``pfft_init_input_real*`` functions will set all array
+elements to zero where ``local_n + local_start >= n``. In
+addition, both ``pfft_check_output_real*`` functions will ignore all the
+errors resulting from these elements. Therefore, it is safe to use all
+these functions for a consistency check of a r2c transform followed by a
+c2r transform since all padding elements will be ignored.
 
 Initializing r2c/c2r Inputs and Checking Outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1299,15 +1377,16 @@ decribed in Section [sec:init-data-3d-r2r]. However, generating suitable
 inputs for a c2r transform requires more caution. In order to get real
 valued results of a DFT the complex input coefficients need to satisfy
 an radial Hermitian symmetry, i.e.,
-:math:`X[\mathbf k] = {X^*[-\mathbf k]}`. We use the following trick to
-generate the complex input values for c2r transforms. Assume any
-:math:`\mathbf N`-periodic complex valued function :math:`f`. It can be
-easily shown that the values
-:math:`X[\mathbf k] := \frac{1}{2}\left(f(\mathbf k)+f^*(-\mathbf k)\right)`
+:math:`X[\boldsymbol{k}] = {X^*[-\boldsymbol{k}]}`.
+We use the following trick to generate the complex input values for c2r
+transforms. Assume any :math:`\boldsymbol{N}`-periodic
+complex valued function :math:`f`. It can be easily shown that the
+values
+:math:`X[\boldsymbol{k}] := \frac{1}{2}\left(f(\boldsymbol{k})+f^*(-\boldsymbol{k})\right)`
 satisfy the radial Hermitian symmetry.
 
-To fill a complex array with reproducible, complex values that fulfill
-the radial Hermitian symmetry use one of the functions
+To fill a complex array ``data`` with reproducible, complex values that
+fulfill the radial Hermitian symmetry use one of the functions
 
 ::
 
@@ -1320,9 +1399,10 @@ the radial Hermitian symmetry use one of the functions
         const ptrdiff_t *local_n, const ptrdiff_t *local_start,
         double *data);
 
-Hereby, the arrays , and give the size of the FFT, the local array size
-and the local array offset as computed by the array distribution
-functions described in Section [sec:local-size] The functions
+Hereby, the arrays ``n``, ``local_n`` and ``local_start`` give the
+size of the FFT, the local array size and the local array offset as
+computed by the array distribution functions described in
+Section [sec:local-size]. The functions
 
 ::
 
@@ -1335,9 +1415,10 @@ functions described in Section [sec:local-size] The functions
         const ptrdiff_t *local_n, const ptrdiff_t *local_start,
         const pfft_complex *data, MPI_Comm comm);
 
-compute the :math:`l_1`-norm between the elements of array and values
-produced by , . In addition, we supply the following functions for
-setting all the input data to zero at once
+compute the :math:`l_1`-norm between the elements of array ``data`` and
+values produced by ``pfft_init_input_complex_hermitian_3d``,
+``pfft_init_input_complex_hermitian``. In addition, we supply the
+following functions for setting all the input data to zero at once
 
 ::
 
@@ -1355,26 +1436,26 @@ inputs with radial Hermitian symmetry for ordinary c2c transforms. Of
 course the results of such a c2c DFT will have all imaginary parts equal
 to zero up to machine precision.
 
-Operations on Arrays of Type 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Operations on Arrays of Type ``ptrdiff_t``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The following routines are shortcuts for the elementwise manipulation of
-valued arrays. In the following, all arrays , , and are of length and
-type .
+``ptrdiff_t`` valued arrays. In the following, all arrays ``vec``,
+``vec1``, and ``vec2`` are of length ``d`` and type ``ptrdiff_t``.
 
 ::
 
     ptrdiff_t pfft_prod_INT(
         int d, const ptrdiff_t *vec);
 
-Returns the product over all elements of .
+Returns the product over all elements of ``vec``.
 
 ::
 
     ptrdiff_t pfft_sum_INT(
         int d, const ptrdiff_t *vec);
 
-Returns the sum over all elements of .
+Returns the sum over all elements of ``vec``.
 
 ::
 
@@ -1389,7 +1470,7 @@ Returns 1 if both arrays have equal entries, 0 otherwise.
         int d, const ptrdiff_t *vec1,
         ptrdiff_t *vec2);
 
-Copies the elements of into .
+Copies the elements of ``vec1`` into ``vec2``.
 
 ::
 
@@ -1397,7 +1478,7 @@ Copies the elements of into .
         int d, const ptrdiff_t *vec1, const ptrdiff_t *vec2,
         ptrdiff_t *sum);
 
-Fills with the componentwise sum of and .
+Fills ``sum`` with the componentwise sum of ``vec1`` and ``vec2``.
 
 ::
 
@@ -1405,14 +1486,15 @@ Fills with the componentwise sum of and .
         int d, const ptrdiff_t *vec1, const ptrdiff_t *vec2,
         ptrdiff_t *sum);
 
-Fills with the componentwise difference of and .
+Fills ``sum`` with the componentwise difference of ``vec1`` and
+``vec2``.
 
 Print Three-Dimensional Arrays in Parallel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Use the following routine to print the elements of a block decomposed
-three-dimensional (real or complex valued) array in a nicely formatted
-way.
+three-dimensional (real or complex valued) array ``data`` in a nicely
+formatted way.
 
 ::
 
@@ -1426,12 +1508,12 @@ way.
         const char *name, MPI_Comm comm);
 
 Obviously, this makes only sense for arrays of moderate size. The block
-decomposition is given by , as returned by the array distribution
-function decribed in Section [sec:local-size]. Furthermore, some
-arbitrary string can be added at the beginning of each output -
-typically this will be the name of the array. Communicator must be
-suitable to the block decomposition and is used to synchronize the
-outputs over all processes.
+decomposition is given by ``local_n``, ``local_n_start`` as returned by
+the array distribution function described in Section [sec:local-size].
+Furthermore, some arbitrary string ``name`` can be added at the
+beginning of each output - typically this will be the name of the array.
+Communicator ``comm`` must be suitable to the block decomposition and is
+used to synchronize the outputs over all processes.
 
 Generalizations for the case where the dimensions of the local arrays
 are permuted are given by
@@ -1449,13 +1531,14 @@ are permuted are given by
         int perm0, int perm1, int perm2,
         const char *name, MPI_Comm comm);
 
-Hereby, , , and give the array’s permutation of dimension.
+Hereby, ``perm0``, ``perm1``, and ``perm2`` give the array’s permutation
+of dimension.
 
 Reading Command Line Arguments
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The following function offers a simple way to read command line
-arguments into an array .
+arguments into an array ``parameter``.
 
 ::
 
@@ -1464,13 +1547,16 @@ arguments into an array .
         int neededArgs, unsigned type,
         void *parameter);
 
-Hereby, and are the standard argument of the routine. Furthermore, , ,
-and give the name, number of entries and the type of the command line
-argument. Supported types are , , , , and , which denote the standard C
-type that is used for typecasting. In addition, you can use the special
-type that is an integer type equal to one if the corresponding command
-line argument is given. The array must be of sufficient size to hold
-elements of the given data type. Special attention is given
+Hereby, ``argc`` and ``argv`` are the standard arguments of the ``main``
+routine. Furthermore, ``name``, ``neededArgs``, and ``type`` give the
+name, number of entries and the type of the command line argument.
+Supported types are ``PFFT_INT``, ``PFFT_PTRDIFF_T``, ``PFFT_FLOAT``,
+``PFFT_DOUBLE``, and ``PFFT_UNSIGNED``, which denote the standard C type
+that is used for typecasting. In addition, you can use the special type
+``PFFT_SWITCH`` that is an integer type equal to one if the
+corresponding command line argument is given. The array ``parameter``
+must be of sufficient size to hold ``neededArgs`` elements of the given
+data type. Special attention is given
 
 For example, a program containing the following code snippet
 
@@ -1491,18 +1577,21 @@ that is executed via
 
     ./test -pfft_x 3.1 -pfft_np 2 3 -pfft_n 8 16 32 -pfft_on
 
-will read , , , and turn on the . Note the address operator in front of
-in the second line! Furthermore, note that the initialization of all
-variables with default values before the call of avoids trouble if the
-user does not provide all the command line arguments.
+will read ``x=3.1``, ``np[2] = (2,3)``, ``n[3]= (8,16,32)``, and turn on
+the ``switch=1``. Note the address operator ``&`` in front of ``x`` in
+the second line! Furthermore, note that the initialization of all
+variables with default values before the call of ``pfft_get_args``
+avoids trouble if the user does not provide all the command line
+arguments.
 
-Parallel Substitutes for , , and 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Parallel Substitutes for ``vprintf``, ``fprintf``, and ``printf``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The following functions are similar to the standard C function , and
-with the exception, that only rank within the given communicator will
-produce output. The intension is to avoid the flood of messages that is
-produced when simple statement are run in parallel.
+The following functions are similar to the standard C functions
+``vfprintf``, ``fprintf`` and ``printf`` with the exception, that only
+rank ``0`` within the given communicator ``comm`` will produce output.
+The intention is to avoid the flood of messages that is produced when
+simple ``printf`` statements are run in parallel.
 
 ::
 
@@ -1516,8 +1605,8 @@ produced when simple statement are run in parallel.
 Generating Periodic Cartesian Communicators
 -------------------------------------------
 
-Based on the processes that are part of the given communicator the
-following routine
+Based on the processes that are part of the given communicator ``comm``
+the following routine
 
 ::
 
@@ -1526,9 +1615,10 @@ following routine
         MPI_Comm *comm_cart_1d);
 
 allocates and creates a one-dimensional, periodic, Cartesian
-communicator of size . Thereby, a non-zero error code is returned
-whenever does not fit the size of . The memory of the generated
-communicator should be released with after usage. Analogously, use
+communicator ``comm_cart_1d`` of size ``np0``. Thereby, a non-zero error
+code is returned whenever ``np0`` does not fit the size of ``comm``. The
+memory of the generated communicator should be released with
+``MPI_Comm_free`` after usage. Analogously, use
 
 ::
 
@@ -1537,7 +1627,7 @@ communicator should be released with after usage. Analogously, use
         MPI_Comm *comm_cart_2d);
 
 in order to allocate and create two-dimensional, periodic, Cartesian
-communicator of size or
+communicator ``comm_cart_2d`` of size ``np0*np1`` or
 
 ::
 
@@ -1545,6 +1635,8 @@ communicator of size or
         int rnk_np, MPI_Comm comm, const int *np,
         MPI_Comm *comm_cart);
 
-in order to allocate and create a -dimensional, periodic, Cartesian
-communicator of size . Hereby, is an array of length . Again, the memory
-of the generated communicator should be released with after usage.
+in order to allocate and create a ``rnk_np``-dimensional, periodic,
+Cartesian communicator of size ``np[0]*np[1]*...*np[rnk_np-1]``. Hereby,
+``np`` is an array of length ``rnk_np``. Again, the memory of the
+generated communicator should be released with ``MPI_Comm_free`` after
+usage.
diff --git a/doc/reference.tex b/doc/reference.tex
index 77d8a74..315148b 100644
--- a/doc/reference.tex
+++ b/doc/reference.tex
@@ -1177,7 +1177,7 @@ \subsection{Reading Command Line Arguments}
 \begin{lstlisting}
 ./test -pfft_x 3.1 -pfft_np 2 3 -pfft_n 8 16 32 -pfft_on
 \end{lstlisting}
-will read \code{x=3.1}, \code{np[2] = \{2,3\}}, \code{n[3]=\{8,16,32\}}, and turn on the \code{switch=1}.
+will read \code{x=3.1}, \code{np[2] = (2,3)}, \code{n[3]= (8,16,32)}, and turn on the \code{switch=1}.
 Note the address operator \code{&} in front of \code{x} in the second line!
 Furthermore, note that the initialization of all variables with default values before the call of \code{pfft_get_args}
 avoids trouble if the user does not provide all the command line arguments.
diff --git a/doc/shortcuts.tex b/doc/shortcuts.tex
index 7d664e5..c434946 100644
--- a/doc/shortcuts.tex
+++ b/doc/shortcuts.tex
@@ -183,7 +183,8 @@
   %       belowskip= -2ex}
   % 
 %   \newcommand{\code}[1]{\linebreak[2]{\ttfamily #1}}
-  \newcommand{\code}[2][\empty]{\ifthenelse{\equal{#1}{\empty}}{\lstinline!#2!}{\lstinline[#1]!#2!}}
+   \newcommand{\code}[1]{:code:`#1`}
+%  \newcommand{\code}[2][\empty]{\ifthenelse{\equal{#1}{\empty}}{\lstinline!#2!}{\lstinline[#1]!#2!}}
 }
 
 %% own enviroment for case differentiation
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index dd1a641..d862765 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1,3 +1,7 @@
+.. Stray LaTeX macro residue (``ifpackageloaded``) emitted by the pandoc
+   conversion was removed from the top of this file.
+
+
 Tutorial
 ========
 
@@ -11,11 +15,12 @@ A first parallel transform - Three-dimensional FFT with two-dimensional data dec
 We explain the basic steps for computing a parallel FFT with the PFFT
 library at the example of the short test program given by
 Listing [lst:man\ :sub:`c`\ 2c]. This test computes a three-dimensional
-c2c-FFT on a two-dimensional process mesh. The source code can be found
-in directory of the library’s source code tree.
+c2c-FFT on a two-dimensional process mesh. The source code
+``manual_c2c_3d.c`` can be found in directory ``tests/`` of the
+library’s source code tree.
 
-After initializing MPI with and before calling any other PFFT routine
-initialize the parallel FFT computations via
+After initializing MPI with ``MPI_Init`` and before calling any other
+PFFT routine initialize the parallel FFT computations via
 
 ::
 
@@ -33,12 +38,14 @@ the following routine
         MPI_Comm comm, int np0, int np1,
         MPI_Comm *comm_cart_2d);
 
-This routine uses the processes within the communicator to create a
-two-dimensional process grid of size x and stores it into the Cartesian
-communicator . Note that is allocated by the routine and must be freed
-with after usage. The input parameter is a communicator, indicating
-which processes will participate in the transform. Choosing as implies
-that the FFT is computed on all available processes.
+This routine uses the processes within the communicator ``comm`` to
+create a two-dimensional process grid of size ``np0`` x ``np1`` and
+stores it into the Cartesian communicator ``comm_cart_2d``. Note that
+``comm_cart_2d`` is allocated by the routine and must be freed with
+``MPI_Comm_free`` after usage. The input parameter ``comm`` is a
+communicator, indicating which processes will participate in the
+transform. Choosing ``comm`` as ``MPI_COMM_WORLD`` implies that the FFT
+is computed on all available processes.
 
 At the next step we need to know the data decomposition of the input and
 output array, that depends on the array sizes, the process grid and the
@@ -51,20 +58,24 @@ chosen parallel algorithm. Therefore, we call
         ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
         ptrdiff_t *local_no, ptrdiff_t *local_o_start);
 
-Hereby, , , , , are arrays of length :math:`3` that must be allocated.
+Hereby, ``n``, ``local_ni``, ``local_i_start``, ``local_no``,
+``local_o_start`` are arrays of length :math:`3` that must be allocated.
 The return value of this function equals the size of the local complex
 array that needs to be allocated by every process. In most cases, this
 coincides with the product of the local array sizes – but may be bigger,
 whenever the parallel algorithm needs some extra storage. The input
-value gives the three-dimensional FFT size and the flag serves to adjust
-some details of the parallel execution. For the sake of simplicity, we
-restrict our self to the case for a while and explain the more
-sophisticated flags at a later point. The output arrays and give the
-size and the offset of the local input array that result from the
-parallel block distribution of the global input array, i.e., every
-process owns the input data with for . Analogously, the output
-parameters and contain the size and the offset of the local output
-array.
+value ``n`` gives the three-dimensional FFT size and the flag
+``pfft_flags`` serves to adjust some details of the parallel execution.
+For the sake of simplicity, we restrict ourselves to the case
+``pfft_flags=PFFT_TRANSPOSED_NONE`` for a while and explain the more
+sophisticated flags at a later point. The output arrays ``local_ni`` and
+``local_i_start`` give the size and the offset of the local input array
+that result from the parallel block distribution of the global input
+array, i.e., every process owns the input data ``in[k[0],k[1],k[2]]``
+with ``local_i_start[t] <= k[t] < local_i_start[t] + local_ni[t]`` for
+``t=0,1,2``. Analogously, the output parameters ``local_no`` and
+``local_o_start`` contain the size and the offset of the local output
+array.
 
 Afterward, the input and output arrays must be allocated. Hereby,
 
@@ -72,13 +83,14 @@ Afterward, the input and output arrays must be allocated. Hereby,
 
     pfft_complex* pfft_alloc_complex(size_t size);
 
-is a simple wrapper of , which in turn allocates the memory via to
-ensure proper alignment for SIMD. Have a look at the FFTW user manual 
-for more details on SIMD memory alignment and . Nevertheless, you can
-also use any other dynamic memory allocation.
+is a simple wrapper of ``fftw_alloc_complex``, which in turn allocates
+the memory via ``fftw_malloc`` to ensure proper alignment for SIMD. Have
+a look at the FFTW user manual for more details on SIMD memory
+alignment and ``fftw_malloc``. Nevertheless, you can also use any other
+dynamic memory allocation.
 
-The planning of a single three-dimensional parallel FFT of size x x is
-done by the function
+The planning of a single three-dimensional parallel FFT of size ``n[0]``
+x ``n[1]`` x ``n[2]`` is done by the function
 
 ::
 
@@ -86,20 +98,22 @@ done by the function
         ptrdiff_t *n, pfft_complex *in, pfft_complex *out,
         MPI_Comm comm_cart_2d, int sign, unsigned pfft_flags);
 
-We provide the address of the input and output array by the pointers and
-, respectively. An inplace transform is assumed if these pointers are
-equal. The integer gives the sign in the exponential of the FFT.
-Possible values are (:math:`-1`) and (:math:`+1`). Flags passed to the
-planner via must coincide with the flags that were passed to . Otherwise
-the data layout of the parallel execution may not match calculated local
-array sizes. As return value we get a PFFT plan, some structure that
-stores all the information needed to perform a parallel FFT.
-
-Once the plan is generated, we are allowed to fill the input array .
-Note, that per default the planning step will overwrite input array .
-Therefore, you should not write any sensitive data into until the plan
-was generated. For simplicity, our test program makes use of the library
-function
+We provide the address of the input and output array by the pointers
+``in`` and ``out``, respectively. An inplace transform is assumed if
+these pointers are equal. The integer ``sign`` gives the sign in the
+exponential of the FFT. Possible values are ``PFFT_FORWARD``
+(:math:`-1`) and ``PFFT_BACKWARD`` (:math:`+1`). Flags passed to the
+planner via ``pfft_flags`` must coincide with the flags that were
+passed to ``pfft_local_size_3d``. Otherwise the data layout of the
+parallel execution may not match calculated local array sizes. As return
+value we get a PFFT plan, some structure that stores all the information
+needed to perform a parallel FFT.
+
+Once the plan is generated, we are allowed to fill the input array
+``in``. Note, that per default the planning step ``pfft_plan_dft_3d``
+will overwrite input array ``in``. Therefore, you should not write any
+sensitive data into ``in`` until the plan was generated. For simplicity,
+our test program makes use of the library function
 
 ::
 
@@ -108,8 +122,8 @@ function
         pfft_complex *in);
 
 to fill the input array with some numbers. Alternatively, one can fill
-the array with a function of choice and the following loop that takes
-account of the parallel data layout
+the array with a function ``func`` of choice and the following loop that
+takes account of the parallel data layout
 
 ::
 
@@ -127,9 +141,9 @@ The parallel FFT is computed when we execute the generated plan via
 
     void pfft_execute(const pfft_plan plan);
 
-Now, the results can be read from with an analogous three-dimensional
-loop. If we do not want to execute another parallel FFT of the same
-type, we free the allocated memory of the plan with
+Now, the results can be read from ``out`` with an analogous
+three-dimensional loop. If we do not want to execute another parallel
+FFT of the same type, we free the allocated memory of the plan with
 
 ::
 
@@ -141,13 +155,14 @@ Additionally, we use
 
     int MPI_Comm_free(MPI_Comm *comm);  
 
-to free the communicator allocated by and
+to free the communicator allocated by ``pfft_create_procmesh_2d`` and
 
 ::
 
     void pfft_free(void *ptr);
 
-to free memory allocated by . Finally, we exit MPI via
+to free memory allocated by ``pfft_alloc_complex``. Finally, we exit MPI
+via
 
 ::
 
@@ -202,22 +217,24 @@ Listing [lst:pfft\ :sub:`3`\ don1d].
         MPI_Finalize();
     }
 
-substitute by
+substitute ``fftw3-mpi.h`` by ``pfft.h``
 
-substitute all prefixes and by
+substitute all prefixes ``fftw_`` and ``fftw_mpi_`` by ``pfft_``
 
-substitute all prefixes by
+substitute all prefixes ``FFTW_`` by ``PFFT_``
 
-the integers , , become arrays of length 3
+the integers ``N``, ``local_n0``, ``local_0_start`` become arrays of
+length 3
 
-in
+``dft_`` in ``pfft_local_size_dft_3d``
 
-has additional input and additional outputs ,
+``pfft_local_size_dft_3d`` has additional input ``pfft_flags`` and
+additional outputs ``local_no``, ``local_o_start``
 
-The loop that inits becomes splitted along all three dimensions. We
-could also use
+The loop that initializes ``data`` is split along all three
+dimensions. We could also use
 
-First, All prefixes are substituted by
+First, all prefixes ``fftw_`` are substituted by ``pfft_``
 
 Now, the changes in order to use a two-dimensional process mesh are
 marginal as can be seen in Listing [lst:pfft\ :sub:`3`\ don2d].
@@ -281,13 +298,14 @@ As we have seen the function
         MPI_Comm comm, int np0, int np1,
         MPI_Comm *comm_cart_2d);
 
-creates a two-dimensional, periodic, Cartesian communicator. The return
-value (not used in Listing [lst:man\ :sub:`c`\ 2c]) is the forwarded
-error code of . It is equal to zero if the communicator was created
-successfully. The most common error is that the number of processes
-within the input communicator does not fit . In this case the Cartesian
-communicator is not generated and the return value is unequal to zero.
-Therefore, a typical sanity check might look like
+creates a two-dimensional, periodic, Cartesian communicator. The ``int``
+return value (not used in Listing [lst:man\ :sub:`c`\ 2c]) is the
+forwarded error code of ``MPI_Cart_create``. It is equal to zero if the
+communicator was created successfully. The most common error is that the
+number of processes within the input communicator ``comm`` does not fit
+``np0 x np1``. In this case the Cartesian communicator is not generated
+and the return value is unequal to zero. Therefore, a typical sanity
+check might look like
 
 ::
 
@@ -311,9 +329,9 @@ Hereby, we use the PFFT library function
         MPI_Comm comm, FILE *stream, const char *format, ...);
 
 to print the error message. This function is similar to the standard C
-function with the exception, that only the process with MPI rank
-:math:`0` within the given communicator will produce some output; see
-Section [sec:fprintf] for details.
+function ``fprintf`` with the exception, that only the process with MPI
+rank :math:`0` within the given communicator ``comm`` will produce some
+output; see Section [sec:fprintf] for details.
 
 Inplace transforms
 ------------------
@@ -323,9 +341,9 @@ place, which means that beside some constant buffers, no second data
 array is necessary. Especially, the global data communication can be
 performed in place. As far as we know, there is no other parallel FFT
 library beside FFTW and PFFT that supports this feature. This feature is
-enabled as soon as the pointer to the output array is equal to the
-pointer to the input array . E.g., in Listing [lst:man\ :sub:`c`\ 2c] we
-would call
+enabled as soon as the pointer to the output array ``out`` is equal to
+the pointer to the input array ``in``. E.g., in
+Listing [lst:man\ :sub:`c`\ 2c] we would call
 
 ::
 
@@ -360,8 +378,8 @@ Therefore, we present a generalization of communicator creation function
         int rnk_np, MPI_Comm comm, const int *np,
         MPI_Comm *comm_cart);
 
-Hereby, the array of length gives the size of the Cartesian communicator
-.
+Hereby, the array ``np`` of length ``rnk_np`` gives the size of the
+Cartesian communicator ``comm_cart``.
 
 Parallel data decomposition
 ---------------------------
@@ -370,10 +388,11 @@ In the following, we use the notation :math:`\frac{n}{P}` to symbolize
 that an array of length :math:`n` is broken into disjoint blocks and
 distributed on :math:`P` MPI processes. Hereby, the data is distributed
 in compliance to the FFTW-MPI data decompostion , i.e., the first
-(rounded down) processes get a contiguous chunk of elements, the next
-process gets the remaining data elements, and all remaining processes
-get nothing. Thereby, the block size defaults to (rounded down) but can
-also be user defined.
+``n/block`` (rounded down) processes get a contiguous chunk of ``block``
+elements, the next process gets the remaining ``n - block * (n/block)``
+data elements, and all remaining processes get nothing. Thereby, the
+block size ``block`` defaults to ``n/P`` (rounded down) but can also be
+user defined.
 
 Non-transposed and transposed data layout
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -382,29 +401,32 @@ In the following, we use the notation :math:`\frac{n}{P}` to symbolize
 that an array of length :math:`n` is distributed on :math:`P` MPI
 processes. The standard PFFT data decomposition of :math:`h` interleaved
 :math:`d`-dimensional arrays of equal size
-:math:`n_0 \times n_1\times \hdots \times n_{d-1}` on a
+:math:`n_0 \times n_1\times \dots \times n_{d-1}` on a
 :math:`r`-dimensional process mesh of size
-:math:`P_0\times \hdots \times P_{r-1}` is given by the blocks
+:math:`P_0\times \dots \times P_{r-1}` is given by the blocks
 
-.. math:: \frac{n_0}{P_0} \times \frac{n_1}{P_1} \times \hdots \times \frac{n_{r-1}}{P_{r-1}}  \times n_r \times n_{r+1} \times \hdots \times n_{d-1} \times h.
+.. math:: \frac{n_0}{P_0} \times \frac{n_1}{P_1} \times \dots \times \frac{n_{r-1}}{P_{r-1}}  \times n_r \times n_{r+1} \times \dots \times n_{d-1} \times h.
 
-A PFFT created with planning flag requires the inputs to be decomposed
-in this standard way and produces outputs that are decomposed in the
-same way.
+A PFFT created with planning flag ``PFFT_TRANSPOSED_NONE`` requires the
+inputs to be decomposed in this standard way and produces outputs that
+are decomposed in the same way.
 
 PFFT can save half of the global communication amount, if the data
 reordering to standard decomposition is omitted. The transposed data
 decomposition is given by
 
-.. math:: \frac{n_1}{P_0} \times \frac{n_2}{P_1} \times \hdots \times \frac{n_{r}}{P_{r-1}}  \times n_0 \times n_{r+1} \times \hdots \times n_{d-1} \times h
+.. math:: \frac{n_1}{P_0} \times \frac{n_2}{P_1} \times \dots \times \frac{n_{r}}{P_{r-1}}  \times n_0 \times n_{r+1} \times \dots \times n_{d-1} \times h
 
-A PFFT plan created with planning flag produces outputs with transposed
-data decomposition. Analogously, a PFFT plan created with planning flag
-requires its inputs to be decomposed in the transposed way. Typically,
-one creates a forward plan with and a backward plan with planning flag .
+A PFFT plan created with planning flag ``PFFT_TRANSPOSED_OUT`` produces
+outputs with transposed data decomposition. Analogously, a PFFT plan
+created with planning flag ``PFFT_TRANSPOSED_IN`` requires its inputs to
+be decomposed in the transposed way. Typically, one creates a forward
+plan with ``PFFT_TRANSPOSED_OUT`` and a backward plan with planning flag
+``PFFT_TRANSPOSED_IN``.
 
-Note that the flags and must be passed to the array distribution
-function (see Section [sec:local-size]) *as well as* to the planner (see
+Note that the flags ``PFFT_TRANSPOSED_OUT`` and ``PFFT_TRANSPOSED_IN``
+must be passed to the array distribution function (see
+Section [sec:local-size]) *as well as* to the planner (see
 Section [sec:create-plan]).
 
 Three-dimensional FFTs with three-dimensional data decomposition
@@ -427,23 +449,27 @@ and compute the parallel FFT with this two-dimensional decomposition.
 Note that the 3d to 2d remap implies some very special restrictions on
 the block sizes for :math:`n_0` and :math:`n_1`, i.e., the blocks must
 be divisible by :math:`Q_0` and :math:`Q_1`. More precisely, the default
-blocks of the 2d-decomposition are given by and (both divisions rounded
-down). This implies that the default blocks of the 3d-decomposition must
-be , , and (all divisions rounded down).
+blocks of the 2d-decomposition are given by ``n0/(P0*Q0)`` and
+``n1/(P1*Q1)`` (both divisions rounded down). This implies that the
+default blocks of the 3d-decomposition must be ``n0/(P0*Q0) * Q0``,
+``n1/(P1*Q1) * Q1``, and ``n2/(Q0*Q1)`` (all divisions rounded down).
 
 Planning effort
 ---------------
 
 Pass one of the following flags
 
-,
+``PFFT_ESTIMATE``,
+
+``PFFT_MEASURE``,
 
-,
+``PFFT_PATIENT``, or,
 
-, or,
+``PFFT_EXHAUSTIVE``
 
-to the PFFT planner in order to plan all internal FFTW plans with , , ,
-or , respectively. The default value is .
+to the PFFT planner in order to plan all internal FFTW plans with
+``FFTW_ESTIMATE``, ``FFTW_MEASURE``, ``FFTW_PATIENT``, or
+``FFTW_EXHAUSIVE``, respectively. The default value is ``PFFT_MEASURE``.
 
 PFFT uses FFTW plans for parallel array transposition and the serial
 transforms. In fact, every serial transform is a combination of strided
@@ -451,40 +477,49 @@ lower-dimensional FFTs and a serial array transposition (necessary to
 prepare the global transposition) which can be done by a single FFTW
 plan. However, it turns out that FFTW sometimes performs better if the
 serial transposition and the strided FFTs are executed separately.
-Therefore, PFFT introduces the flag that enables extensive run time
-tests in order to find the optimal sequence of serial strided FFT and
-serial transposition for every serial transform. These tests are disable
-on default which corresponds to the flag .
+Therefore, PFFT introduces the flag ``PFFT_TUNE`` that enables extensive
+run time tests in order to find the optimal sequence of serial strided
+FFT and serial transposition for every serial transform. These tests are
+disabled by default, which corresponds to the flag ``PFFT_NO_TUNE``.
 
 Preserving input data
 ---------------------
 
 The following flags
 
-,
+``PFFT_PRESERVE_INPUT``,
 
-, and,
+``PFFT_DESTROY_INPUT``, and,
+
+``PFFT_BUFFERED_INPLACE``
 
 only take effect for out-of-place transforms. The first one behaves
-analogously to the FFTW flag and ensures that the input values are not
-overwritten. In fact, this flag implies that only the first serial
-transform is executed out-of-place and all successive steps are
-performed in-place on the output array. In compliance to FFTW, this is
-the default behaviour for out-of-place plans.
-
-The second flag behaves analogously to the FFTW flag and tells the
-planner that the input array can be used as scratch array. This may give
-some speedup for out-of-place plans, because all the intermediate
-transforms and transposition steps can be performed out-of-place.
-
-Finally, the flag can be used for out-of-place plans that store its
-inputs and outputs in the same array, i.e., array is used for
-intermediate out-of-place transforms and transpositions but the PFFT
-inputs and outputs are stored in array .
+analogously to the FFTW flag ``FFTW_PRESERVE_INPUT`` and ensures that
+the input values are not overwritten. In fact, this flag implies that
+only the first serial transform is executed out-of-place and all
+successive steps are performed in-place on the output array. In
+compliance to FFTW, this is the default behaviour for out-of-place
+plans.
+
+The second flag behaves analogously to the FFTW flag
+``FFTW_DESTROY_INPUT`` and tells the planner that the input array can be
+used as scratch array. This may give some speedup for out-of-place
+plans, because all the intermediate transforms and transposition steps
+can be performed out-of-place.
+
+Finally, the flag ``PFFT_BUFFERED_INPLACE`` can be used for out-of-place
+plans that store their inputs and outputs in the same array, i.e., array
+``out`` is used for intermediate out-of-place transforms and
+transpositions but the PFFT inputs and outputs are stored in array
+``in``.
 
 FFTs with shifted index sets
 ----------------------------
 
+``PFFT_SHIFTED_IN``
+
+``PFFT_SHIFTED_OUT``
+
 Pruned FFT and Shifted Index Sets
 ---------------------------------
 
@@ -493,16 +528,18 @@ Pruned FFT
 
 For pruned r2r- and c2c-FFT are defined as
 
-.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_k \eim{kl/n}, \quad l=0,\hdots,n_o-1,
+.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}, \quad l=0,\dots,n_o-1,
 
 where :math:`n_i\le n` and :math:`n_o\le n`.
 
 Shifted Index Sets
 ~~~~~~~~~~~~~~~~~~
 
-For :math:`N\in 2\N` we define the FFT with shifted inputs
+For :math:`N\in 2{\ensuremath{\mathbb{N}}}` we define the FFT with
+shifted inputs
 
-For :math:`K,L,N\in 2\N`, :math:`L<N`, :math:`L<N` we define
+For :math:`K,L,N\in 2{\ensuremath{\mathbb{N}}}`, :math:`L<N`,
+:math:`L<N` we define
 
 Precisions
 ----------
@@ -514,18 +551,21 @@ You can install single and long-double precision versions of PFFT, which
 replace double with float and long double, respectively; see
 [sec:install]. To use these interfaces, you must
 
-Link to the single/long-double libraries; on Unix, or instead of (or in
-addition to) . (You can link to the different-precision libraries
-simultaneously.)
+Link to the single/long-double libraries; on Unix, ``-lpfftf`` or
+``-lpfftl`` instead of (or in addition to) ``-lpfft``. (You can link to
+the different-precision libraries simultaneously.)
 
-Include the same header file.
+Include the same ``<pfft.h>`` header file.
 
-Replace all lowercase instances of ‘’ with ‘’ or ‘’ for single or
-long-double precision, respectively. ( becomes , becomes , etcetera.)
+Replace all lowercase instances of ‘``pfft_``’ with ‘``pfftf_``’ or
+‘``pfftl_``’ for single or long-double precision, respectively.
+(``pfft_complex`` becomes ``pfftf_complex``, ``pfft_execute`` becomes
+``pfftf_execute``, etcetera.)
 
-Uppercase names, i.e. names beginning with ‘’, remain the same.
+Uppercase names, i.e. names beginning with ‘``PFFT_``’, remain the same.
 
-Replace with or for subroutine parameters.
+Replace ``double`` with ``float`` or ``long double`` for subroutine
+parameters.
 
 Ghost cell communication
 ------------------------

From 7762261b4e6e2dfbe203f83b5447f5acf89014ce Mon Sep 17 00:00:00 2001
From: Yu Feng <rainwoodman@gmail.com>
Date: Sun, 13 Sep 2015 12:32:36 -0700
Subject: [PATCH 5/6] Fix latexpdf build and better HTML math

make -f Makefile-sphinx latexpdf html
---
 doc/conf.py       |  6 +++++-
 doc/shortcuts.tex | 40 ++++++++++++++++++++--------------------
 2 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/doc/conf.py b/doc/conf.py
index acaf658..38b441b 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -215,7 +215,11 @@
 #'pointsize': '10pt',
 
 # Additional stuff for the LaTeX preamble.
-#'preamble': '',
+'preamble': r"""
+\usepackage{amsmath}
+\usepackage{amssymb}
+\usepackage{nicefrac}
+""",
 
 # Latex figure (float) alignment
 #'figure_align': 'htbp',
diff --git a/doc/shortcuts.tex b/doc/shortcuts.tex
index c434946..d02b976 100644
--- a/doc/shortcuts.tex
+++ b/doc/shortcuts.tex
@@ -6,23 +6,23 @@
 
 
 % Shortcuts for math symbols.
-\newcommand{\N}{\ensuremath{\mathbb{N}}}
-\newcommand{\T}{\ensuremath{\mathbb{T}}}
-\renewcommand{\S}{\ensuremath{\mathbb{S}}}
-\newcommand{\NZ}{\ensuremath{\mathbb{N}_{0}}}
-\newcommand{\Z}{\ensuremath{\mathbb{Z}}}
-\newcommand{\R}{\ensuremath{\mathbb{R}}}
-\newcommand{\Rp}{\ensuremath{\mathbb{R}_{+}}}
-\newcommand{\Rn}{\ensuremath{\mathbb{R}^n}} 
-\newcommand{\Rnn}{\ensuremath{\mathbb{R}^{n \times n}}}
-\newcommand{\C}{\ensuremath{\mathbb{C}}}
-\newcommand{\cO}{\ensuremath{\mathcal{O}}}
-\newcommand{\tT}{\ensuremath{\text{\tiny{T}}}}
-\newcommand{\ti}{\ensuremath{\text{\scriptsize{i}}}}
-\newcommand{\e}{{\ensuremath{\mathrm{e}}}}
-\newcommand{\eim}[1]{\ensuremath{\mathrm{e}^{-2\pi{\ti} #1}}}
-\newcommand{\eip}[1]{\ensuremath{\mathrm{e}^{+2\pi{\ti} #1}}}
-\renewcommand{\mathbf}[1]{\ensuremath{\boldsymbol{#1}}}
+\newcommand{\N}{{\mathbb{N}}}
+\newcommand{\T}{{\mathbb{T}}}
+\renewcommand{\S}{{\mathbb{S}}}
+\newcommand{\NZ}{{\mathbb{N}_{0}}}
+\newcommand{\Z}{{\mathbb{Z}}}
+\newcommand{\R}{{\mathbb{R}}}
+\newcommand{\Rp}{{\mathbb{R}_{+}}}
+\newcommand{\Rn}{{\mathbb{R}^n}} 
+\newcommand{\Rnn}{{\mathbb{R}^{n \times n}}}
+\newcommand{\C}{{\mathbb{C}}}
+\newcommand{\cO}{{\mathcal{O}}}
+\newcommand{\tT}{{\text{T}}}
+\newcommand{\ti}{{\text{i}}}
+\newcommand{\e}{{{\mathrm{e}}}}
+\newcommand{\eim}[1]{{\mathrm{e}^{-2\pi{\ti} #1}}}
+\newcommand{\eip}[1]{{\mathrm{e}^{+2\pi{\ti} #1}}}
+\renewcommand{\mathbf}[1]{{\boldsymbol{#1}}}
 \newcommand{\ds}{\displaystyle}
 \newcommand{\sinc}{\textrm{sinc}}
 \newcommand{\dist}{\textrm{dist}}
@@ -44,9 +44,9 @@
 {\raise-0.05em\hbox{\Large $#1$}}{\hbox{\large $#1$}}{#1}}}
 \newcommand{\bigtimes}{\BIGOP{\times}}
 \def\invisible#1{\textcolor{white}{#1}}
-\newcommand{\Vect}[1]{\ensuremath{\mathbf{#1}}}
-\newcommand{\Mat}[1]{\ensuremath{\mathbf{#1}}}
-\newcommand{\Cal}[1]{\ensuremath{\mathcal{#1}}}
+\newcommand{\Vect}[1]{{\mathbf{#1}}}
+\newcommand{\Mat}[1]{{\mathbf{#1}}}
+\newcommand{\Cal}[1]{{\mathcal{#1}}}
 \newcommand{\fft}{\textsf{FFT}}
 
 \newcommand{\ousetarrow}[2]{\overset{\textsf{#1}}{\underset{\textsf{#2}}{\rightarrow}}}

From 188dcc52b934029104ad546bda359cf3ff920f67 Mon Sep 17 00:00:00 2001
From: Yu Feng <rainwoodman@gmail.com>
Date: Sun, 13 Sep 2015 12:33:14 -0700
Subject: [PATCH 6/6] updated rst files.

---
 doc/features.rst  | 54 +++++++++++++++++++++++------------------------
 doc/reference.rst | 11 +++++-----
 doc/tutorial.rst  |  8 +++----
 3 files changed, 35 insertions(+), 38 deletions(-)

diff --git a/doc/features.rst b/doc/features.rst
index 98eb675..f381495 100644
--- a/doc/features.rst
+++ b/doc/features.rst
@@ -8,8 +8,8 @@ Advanced Features
 How to Deal with FFT Index Shifts in Parallel
 ---------------------------------------------
 
-Let :math:`n\in2{\ensuremath{\mathbb{N}}}`. A common problem is that the
-index of the FFT input and/or output array runs between
+Let :math:`n\in2{{\mathbb{N}}}`. A common problem is that the index of
+the FFT input and/or output array runs between
 :math:`-\nicefrac n2,\dots,\nicefrac n2-1`, but the FFT library requires
 them to run between :math:`0,\dots,n-1`. With serial program execution
 one can easily remap the input data :math:`\hat g_k` in a way that is
@@ -44,10 +44,10 @@ shift the input (``PFFT_SHIFTED_IN``) and/or to shift the output
 
 Here, we are interested in the computation of
 
-.. math:: g_l = \sum_{k=-\nicefrac{n_i}{2}}^{\nicefrac{n_i}{2}-1} \hat g_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}, \quad l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1
+.. math:: g_l = \sum_{k=-\nicefrac{n_i}{2}}^{\nicefrac{n_i}{2}-1} \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1
 
-with :math:`n, n_i, n_o \in 2{\ensuremath{\mathbb{N}}}` and
-:math:`n>n_i`, :math:`n>n_o`.
+with :math:`n, n_i, n_o \in 2{{\mathbb{N}}}` and :math:`n>n_i`,
+:math:`n>n_o`.
 
 With an index shift of :math:`\nicefrac n2` both in :math:`k` and
 :math:`l` this equivalent to the computation of
@@ -57,21 +57,21 @@ With an index shift of :math:`\nicefrac n2` both in :math:`k` and
    \begin{aligned}
      g_{(l-\nicefrac{n}{2})}
      &= \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
-        \hat g_{(k-\nicefrac{n}{2})} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} (k-\nicefrac n2)(l-\nicefrac n2)/n}}} \\
-     &= {{\ensuremath{\mathrm{e}}}}^{+\pi{\ensuremath{\text{\scriptsize{i}}}}l} 
+        \hat g_{(k-\nicefrac{n}{2})} {{\mathrm{e}^{-2\pi{{{\text{i}}}} (k-\nicefrac n2)(l-\nicefrac n2)/n}}} \\
+     &= {{{\mathrm{e}}}}^{+\pi{{\text{i}}}l} 
           \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
-          \left(\hat g_{(k-\nicefrac{n}{2})}{{\ensuremath{\mathrm{e}}}}^{+\pi{\ensuremath{\text{\scriptsize{i}}}}(k-\nicefrac n2)}\right) {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}} \\
-     &= {{\ensuremath{\mathrm{e}}}}^{+\pi{\ensuremath{\text{\scriptsize{i}}}}(l-\nicefrac n2)} 
+          \left(\hat g_{(k-\nicefrac{n}{2})}{{{\mathrm{e}}}}^{+\pi{{\text{i}}}(k-\nicefrac n2)}\right) {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}} \\
+     &= {{{\mathrm{e}}}}^{+\pi{{\text{i}}}(l-\nicefrac n2)} 
         \underbrace{
           \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1}
-          \underbrace{\left(\hat g_{(k-\nicefrac{n}{2})}{{\ensuremath{\mathrm{e}}}}^{+\pi{\ensuremath{\text{\scriptsize{i}}}}k}\right)}_{\hat f_k} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}
+          \underbrace{\left(\hat g_{(k-\nicefrac{n}{2})}{{{\mathrm{e}}}}^{+\pi{{\text{i}}}k}\right)}_{\hat f_k} {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}
         }_{f_l}\end{aligned}
 
 for
 :math:` l=\nicefrac n2-\nicefrac{n_o}{2},\dots,\nicefrac n2 +\nicefrac{n_o}{2}-1`.
 Therefore, we get the following algorithm
 
-.. math:: f_l = \sum_{k=0}^n \hat g_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}, \quad l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1
+.. math:: f_l = \sum_{k=0}^n \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1
 
 The special case :math:`k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}`
 corresponds to the shifts the arrays ()
@@ -80,7 +80,7 @@ corresponds to the shifts the arrays ()
 :math:`k=-\nicefrac{n_i}{2},\dots,\nicefrac{n_i}{2}-1` compute
 :math:`\hat f_{(k+\nicefrac{n}{2})} = (-1)^{(k+\nicefrac{n}{2})} \hat g_{k}`.
 For :math:`l=0,\dots,n-1` compute
-:math:`f_l = \sum_{k=0}^{n} \hat f_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}`
+:math:`f_l = \sum_{k=0}^{n} \hat f_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}`
 using PFFT. For :math:`l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1`
 compute :math:`g_l = (-1)^l f_{(l+n/2)} `.
 
@@ -94,12 +94,12 @@ Arbitrary shifts
 More general shifts must be done by the user.
 
 In a more general setting, we are interested in the computation of FFTs
-with shifted index sets, i.e., assume
-:math:`k_s,l_s\in{\ensuremath{\mathbb{Z}}}` and compute
+with shifted index sets, i.e., assume :math:`k_s,l_s\in{{\mathbb{Z}}}`
+and compute
 
 .. math::
 
-   g_l = \sum_{k=k_s}^{n_i+k_s-1} \hat g_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}},
+   g_l = \sum_{k=k_s}^{n_i+k_s-1} \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}},
      \quad l=l_s,\dots,n_o+l_s-1\,.
 
 Because of the periodicity of the FFT this can be easily performed by
@@ -110,7 +110,7 @@ Alg. [alg:fftshift:sub:`t`\ ranslation].
 [1] =1.1ex For :math:`k=0,\dots,n_i-1` assign
 :math:`\hat f_k = \hat g_{(k+k_s\bmod n_i)}`. For
 :math:`l=0,\dots,n_o-1` compute
-:math:`f_l = \sum_{k=0}^{n_i} \hat f_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}`
+:math:`f_l = \sum_{k=0}^{n_i} \hat f_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}`
 using PFFT. For :math:`l=0,\dots,n_o-1` assign
 :math:`g_l = f_{(l-l_s\bmod n_o)}`.
 
@@ -124,11 +124,11 @@ computation of
    \begin{aligned}
      g_{l+l_s}
      &=
-       \sum_{k=k_s}^{n_i+k_s-1} \hat g_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} k(l+l_s)/n}}}
+       \sum_{k=k_s}^{n_i+k_s-1} \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} k(l+l_s)/n}}}
        =
-       \sum_{k=0}^{n_i-1} \hat g_{k+k_s} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} (k+k_s)(l+l_s)/n}}} \\
+       \sum_{k=0}^{n_i-1} \hat g_{k+k_s} {{\mathrm{e}^{-2\pi{{{\text{i}}}} (k+k_s)(l+l_s)/n}}} \\
      &=
-       {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} k_sl/n}}} \sum_{k=0}^{n_i-1} \underbrace{\left(\hat g_{k+k_s}{\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} (k+k_s)l_s/n}}}\right)}_{=: \hat f_k} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}\end{aligned}
+       {{\mathrm{e}^{-2\pi{{{\text{i}}}} k_sl/n}}} \sum_{k=0}^{n_i-1} \underbrace{\left(\hat g_{k+k_s}{{\mathrm{e}^{-2\pi{{{\text{i}}}} (k+k_s)l_s/n}}}\right)}_{=: \hat f_k} {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}\end{aligned}
 
 for all :math:`l=0,\dots,n_o-1`. The resulting
 Alg. [alg:fftshift:sub:`m`\ odulation] preserves the sequence of data at
@@ -137,33 +137,33 @@ the price of some extra computation.
 [alg:fftshift:sub:`m`\ odulation]
 
 [1] =1.1ex For :math:`k=0,\dots,n_i-1` compute
-:math:`\hat f_k = \hat g_{(k+k_s)} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} (k+k_s)l_s/n}}}`.
+:math:`\hat f_k = \hat g_{(k+k_s)} {{\mathrm{e}^{-2\pi{{{\text{i}}}} (k+k_s)l_s/n}}}`.
 For :math:`l=0,\dots,n_o-1` compute
-:math:`f_l = \sum_{k=0}^{n_i} \hat f_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}`
+:math:`f_l = \sum_{k=0}^{n_i} \hat f_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}`
 using PFFT. For :math:`l=0,\dots,n_o-1` compute
-:math:`g_{(l+l_s)} = f_l {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} k_sl/n}}}`.
+:math:`g_{(l+l_s)} = f_l {{\mathrm{e}^{-2\pi{{{\text{i}}}} k_sl/n}}}`.
 
 The special case :math:`k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}`
 corresponds to the shifts the arrays ()
 
 [1] =1.1ex For :math:`k=0,\dots,n_i-1` compute
-:math:`\hat f_k = \hat g_{(k-\nicefrac{n_i}{2})} {{\ensuremath{\mathrm{e}}}}^{+\pi{\ensuremath{\text{\scriptsize{i}}}}(k-\nicefrac{n_i}{2})n_o/n}`.
+:math:`\hat f_k = \hat g_{(k-\nicefrac{n_i}{2})} {{{\mathrm{e}}}}^{+\pi{{\text{i}}}(k-\nicefrac{n_i}{2})n_o/n}`.
 For :math:`l=0,\dots,n_o-1` compute
-:math:`f_l = \sum_{k=0}^{n_i} \hat f_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}`
+:math:`f_l = \sum_{k=0}^{n_i} \hat f_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}`
 using PFFT. For :math:`l=0,\dots,n_o-1` compute
-:math:`g_{(l-\nicefrac{n_o}{2})} = f_l {{\ensuremath{\mathrm{e}}}}^{+\pi{\ensuremath{\text{\scriptsize{i}}}}n_i l/n}`.
+:math:`g_{(l-\nicefrac{n_o}{2})} = f_l {{{\mathrm{e}}}}^{+\pi{{\text{i}}}n_i l/n}`.
 
 Parallel pruned FFT
 -------------------
 
 Within PFFT we define a pruned FFT as
 
-.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_{k} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}, \quad l=0,\dots,n_o-1.
+.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_{k} {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=0,\dots,n_o-1.
 
 Formally, this is equivallent to the following regular size :math:`n`
 FFT
 
-.. math:: f_l = \sum_{k=0}^{n-1} \hat f_{k} {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}, \quad l=0,\dots,n,
+.. math:: f_l = \sum_{k=0}^{n-1} \hat f_{k} {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=0,\dots,n-1,
 
 with
 
diff --git a/doc/reference.rst b/doc/reference.rst
index 0933d542..08a6e39 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -1377,12 +1377,11 @@ decribed in Section [sec:init-data-3d-r2r]. However, generating suitable
 inputs for a c2r transform requires more caution. In order to get real
 valued results of a DFT the complex input coefficients need to satisfy
 an radial Hermitian symmetry, i.e.,
-:math:`X[{\ensuremath{\boldsymbol{k}}}] = {X^*[-{\ensuremath{\boldsymbol{k}}}]}`.
-We use the following trick to generate the complex input values for c2r
-transforms. Assume any :math:`{\ensuremath{\boldsymbol{N}}}`-periodic
-complex valued function :math:`f`. It can be easily shown that the
-values
-:math:`X[{\ensuremath{\boldsymbol{k}}}] := \frac{1}{2}\left(f({\ensuremath{\boldsymbol{k}}})+f^*(-{\ensuremath{\boldsymbol{k}}})\right)`
+:math:`X[{{\boldsymbol{k}}}] = {X^*[-{{\boldsymbol{k}}}]}`. We use the
+following trick to generate the complex input values for c2r transforms.
+Assume any :math:`{{\boldsymbol{N}}}`-periodic complex valued function
+:math:`f`. It can be easily shown that the values
+:math:`X[{{\boldsymbol{k}}}] := \frac{1}{2}\left(f({{\boldsymbol{k}}})+f^*(-{{\boldsymbol{k}}})\right)`
 satisfy the radial Hermitian symmetry.
 
 To fill a complex array ``data`` with reproducible, complex values that
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index d862765..cb65b46 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -528,18 +528,16 @@ Pruned FFT
 
 For pruned r2r- and c2c-FFT are defined as
 
-.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_k {\ensuremath{\mathrm{e}^{-2\pi{{\ensuremath{\text{\scriptsize{i}}}}} kl/n}}}, \quad l=0,\dots,n_o-1,
+.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=0,\dots,n_o-1,
 
 where :math:`n_i\le n` and :math:`n_o\le n`.
 
 Shifted Index Sets
 ~~~~~~~~~~~~~~~~~~
 
-For :math:`N\in 2{\ensuremath{\mathbb{N}}}` we define the FFT with
-shifted inputs
+For :math:`N\in 2{{\mathbb{N}}}` we define the FFT with shifted inputs
 
-For :math:`K,L,N\in 2{\ensuremath{\mathbb{N}}}`, :math:`L<N`,
-:math:`L<N` we define
+For :math:`K,L,N\in 2{{\mathbb{N}}}`, :math:`K<N`, :math:`L<N` we define
 
 Precisions
 ----------