diff --git a/doc/Makefile-sphinx b/doc/Makefile-sphinx new file mode 100644 index 0000000..33668b7 --- /dev/null +++ b/doc/Makefile-sphinx @@ -0,0 +1,192 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+
+# None of the targets in this Makefile names a file that its recipe creates,
+# so every one of them is declared phony; otherwise a stray file or directory
+# called e.g. "html" or "info" would make the target look up to date.
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp \
+        applehelp devhelp epub latex latexpdf latexpdfja text man texinfo \
+        info gettext changes xml pseudoxml linkcheck doctest coverage
+
+# Print the list of supported builder targets.
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  applehelp  to make an Apple Help Book"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+	@echo "  coverage   to run coverage check of the documentation (if enabled)"
+
+# Remove everything below the build directory. BUILDDIR can be overridden on
+# the command line; refuse to continue when it is empty, because the glob
+# would then expand to /*.
+clean:
+	$(if $(strip $(BUILDDIR)),,$(error BUILDDIR is empty; refusing to run rm -rf /*))
+	rm -rf $(BUILDDIR)/*
+
+# One HTML file per source document.
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+# Like html, but every page is written as <name>/index.html.
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+# Each rule below runs sphinx-build with one builder (-b <name>) and writes
+# its output to $(BUILDDIR)/<name>; the echo lines only tell the user where
+# to find the result.
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PFFT.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PFFT.qhc"
+
+applehelp:
+	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+	@echo
+	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+	@echo "N.B. You won't be able to view it unless you put it in" \
+	      "~/Library/Documentation/Help or install it in your application" \
+	      "bundle."
+
+# $$HOME is written with a doubled dollar sign so that the shell, not make,
+# expands it.
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/PFFT"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PFFT"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+# Generate the LaTeX sources, then build the PDFs with the Makefile that the
+# latex builder places in $(BUILDDIR)/latex.
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+# Same as latexpdf, but through the platex/dvipdfmx tool chain.
+latexpdfja:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through platex and dvipdfmx..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+# Generate the Texinfo sources, then run makeinfo via the generated Makefile.
+# The sub-make is started through $(MAKE), like latexpdf above, so that
+# -j/-n and the jobserver are handed down to it.
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	$(MAKE) -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+# Uses I18NSPHINXOPTS instead of ALLSPHINXOPTS, i.e. no shared doctrees cache.
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+ +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000..38b441b --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,291 @@ +# -*- coding: utf-8 -*- +# +# PFFT documentation build configuration file, created by +# sphinx-quickstart on Sun Sep 13 01:20:34 2015. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +import shlex + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + 'sphinx.ext.todo', + 'sphinx.ext.mathjax', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'PFFT' +copyright = u'2015, Michael Pippig' +author = u'Michael Pippig' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1.0.8' +# The full version, including alpha/beta/rc tags. +release = '1.0.8' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. 
They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'nature' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. 
+#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. 
+#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'PFFTdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +'preamble': r""" +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{nicefrac} +""", + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'PFFT.tex', u'PFFT Documentation', + u'Michael Pippig', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pfft', u'PFFT Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'PFFT', u'PFFT Documentation', + author, 'PFFT', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/doc/convert.sh b/doc/convert.sh new file mode 100644 index 0000000..242a9eb --- /dev/null +++ b/doc/convert.sh @@ -0,0 +1,11 @@ +files=*.tex +for i in $files; do + # cat $i | sed 's/\\code{\([^}]*\)}/{\\small \1}/g' | + echo $i + cat preample.tex $i | sed \ + -e 's;\hdots;\dots;g' \ + -e 's/\\code{\([^}]*\)}/\\verb+\1+/g' \ + | pandoc -f latex -t rst \ + > ${i//.tex}.rst +done + diff --git a/doc/develop.rst b/doc/develop.rst new file mode 100644 index 0000000..4d4830d --- /dev/null +++ b/doc/develop.rst @@ -0,0 +1,44 @@ +[2]ifpackageloaded#1#2 [2]ifpackageloaded#1#2 [3]ifpackageloaded#1#2#3 + +#1 + +Developers Guide +================ + +Search and replace patterns +--------------------------- + +Correct alignment of pfft.h header + +:: + + %s/^\( [^ ]\+[^\\]*\) \\/ \1\\/g + +Expand most macros of pfft.h to generate the function reference of this +manual: + +:: + + sed -e 's/ *\\$//g' -e 's/PFFT_EXTERN //g' \ + -e 's/PX(\([^)]*\))/pfft_\1/g' -e 's/ INT/ ptrdiff_t/g' \ + -e 's/ R/ double/g' -e 's/ C/ pfft_complex/g' \ + -e 's/^ //g' pfft.h > pfft.h.expanded + +ToDo +==== + +- ``PFFT_FORWARD`` is defined as ``FFTW_FORWARD`` + +- ``FFTW_FORWARD`` is defined as :math:`-1` + +- PFFT allows to chose between ``FFTW_FORWARD`` and ``FFTW_BACKWARD``, + which is not implemented by FFTW. 
+ +- Matlab uses the same sign convention, i.e., :math:`-1` for ``fft`` + and :math:`+1` for ``ifftn`` + +Measuring parallel run times +---------------------------- + +Use ``MPI_Barrier`` in front of every call to ``pfft_`` function to +avoid unbalanced run times. diff --git a/doc/features.rst b/doc/features.rst new file mode 100644 index 0000000..f381495 --- /dev/null +++ b/doc/features.rst @@ -0,0 +1,183 @@ +[2]ifpackageloaded#1#2 [2]ifpackageloaded#1#2 [3]ifpackageloaded#1#2#3 + +#1 + +Advanced Features +================= + +How to Deal with FFT Index Shifts in Parallel +--------------------------------------------- + +Let :math:`n\in2{{\mathbb{N}}}`. A common problem is that the index of +the FFT input and/or output array runs between +:math:`-\nicefrac n2,\dots,\nicefrac n2-1`, but the FFT library requires +them to run between :math:`0,\dots,n-1`. With serial program execution +one can easily remap the input data :math:`\hat g_k` in a way that is +suitable for the library, i.e., + +.. math:: \hat f_k := \hat g_{(k-\nicefrac n2\bmod n)}, \quad k = 0,\dots,n-1. + +Similarly, one could remap the outputs of the library :math:`f_l`, +:math:`l=0,\cdots,n-1` in the opposite direction in order to get the +required outputs, i.e., + +.. math:: g_l := f_{l \bmod n}, \quad l = -\nicefrac n2,\dots,\nicefrac n2-1. + +These shifts are also known as ``fftshift`` in Matlab. + +However, with distributed memory these ``fftshift`` operations require +more complex data movements and result in a global communication. For +example, the first index of the array moves to the middle and, +therefore, the corresponding data move to another MPI process. +Fortunately, this communication can be avoided at the cost of little +extra computation. At the end of the section we present two PFFT library +functions that perform the necessary pre- and postprocessing for shifted +input and output index sets. 
+ +Shift with half the FFT size +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The special case of input shift :math:`k_s=-\nicefrac n2` and/or output +shift :math:`l_s=-\nicefrac n2` is supported by PFFT. Users can choose to +shift the input (``PFFT_SHIFTED_IN``) and/or to shift the output +(``PFFT_SHIFTED_OUT``). + +Here, we are interested in the computation of + +.. math:: g_l = \sum_{k=-\nicefrac{n_i}{2}}^{\nicefrac{n_i}{2}-1} \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1 + +with :math:`n, n_i, n_o \in 2{{\mathbb{N}}}` and :math:`n>n_i`, +:math:`n>n_o`. + +With an index shift of :math:`\nicefrac n2` both in :math:`k` and +:math:`l` this is equivalent to the computation of + +.. math:: + + \begin{aligned} + g_{(l-\nicefrac{n}{2})} + &= \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1} + \hat g_{(k-\nicefrac{n}{2})} {{\mathrm{e}^{-2\pi{{{\text{i}}}} (k-\nicefrac n2)(l-\nicefrac n2)/n}}} \\ + &= {{{\mathrm{e}}}}^{+\pi{{\text{i}}}l} + \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1} + \left(\hat g_{(k-\nicefrac{n}{2})}{{{\mathrm{e}}}}^{+\pi{{\text{i}}}(k-\nicefrac n2)}\right) {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}} \\ + &= {{{\mathrm{e}}}}^{+\pi{{\text{i}}}(l-\nicefrac n2)} + \underbrace{ + \sum_{k=\nicefrac{n}{2}-\nicefrac{n_i}{2}}^{\nicefrac{n}{2}+\nicefrac{n_i}{2}-1} + \underbrace{\left(\hat g_{(k-\nicefrac{n}{2})}{{{\mathrm{e}}}}^{+\pi{{\text{i}}}k}\right)}_{\hat f_k} {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}} + }_{f_l}\end{aligned} + +for +:math:`l=\nicefrac n2-\nicefrac{n_o}{2},\dots,\nicefrac n2 +\nicefrac{n_o}{2}-1`. +Therefore, we get the following algorithm + +.. 
math:: f_l = \sum_{k=0}^n \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1 + +The special case :math:`k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}` +corresponds to shifting both arrays by half their length. + +For :math:`k=0,\dots,n-1` set :math:`\hat f_k = 0`. For +:math:`k=-\nicefrac{n_i}{2},\dots,\nicefrac{n_i}{2}-1` compute +:math:`\hat f_{(k+\nicefrac{n}{2})} = (-1)^{(k+\nicefrac{n}{2})} \hat g_{k}`. +For :math:`l=0,\dots,n-1` compute +:math:`f_l = \sum_{k=0}^{n-1} \hat f_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}` +using PFFT. For :math:`l=-\nicefrac{n_o}{2},\dots,\nicefrac{n_o}{2}-1` +compute :math:`g_l = (-1)^l f_{(l+n/2)}`. + +Note that this shift implies that the library deals with pruned FFTs in +a special way, i.e., half of the zeros are added at the beginning of the +inputs and the other half is added at the end. + +Arbitrary shifts +~~~~~~~~~~~~~~~~ + +More general shifts must be done by the user. + +In a more general setting, we are interested in the computation of FFTs +with shifted index sets, i.e., assume :math:`k_s,l_s\in{{\mathbb{Z}}}` +and compute + +.. math:: + + g_l = \sum_{k=k_s}^{n_i+k_s-1} \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, + \quad l=l_s,\dots,n_o+l_s-1\,. + +Because of the periodicity of the FFT this can be easily performed by +Alg. [alg:fftshift_translation]. + +[alg:fftshift_translation] + +For :math:`k=0,\dots,n_i-1` assign +:math:`\hat f_k = \hat g_{(k+k_s\bmod n_i)}`. For +:math:`l=0,\dots,n_o-1` compute +:math:`f_l = \sum_{k=0}^{n_i-1} \hat f_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}` +using PFFT. For :math:`l=0,\dots,n_o-1` assign +:math:`g_l = f_{(l-l_s\bmod n_o)}`. + +However, this involves explicit data movement since the sequence of data +changes. For our parallel data decomposition the change of data layout +requires data communication. A simple index shift results in the +computation of + +.. 
math:: + + \begin{aligned} + g_{l+l_s} + &= + \sum_{k=k_s}^{n_i+k_s-1} \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} k(l+l_s)/n}}} + = + \sum_{k=0}^{n_i-1} \hat g_{k+k_s} {{\mathrm{e}^{-2\pi{{{\text{i}}}} (k+k_s)(l+l_s)/n}}} \\ + &= + {{\mathrm{e}^{-2\pi{{{\text{i}}}} k_sl/n}}} \sum_{k=0}^{n_i-1} \underbrace{\left(\hat g_{k+k_s}{{\mathrm{e}^{-2\pi{{{\text{i}}}} (k+k_s)l_s/n}}}\right)}_{=: \hat f_k} {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}\end{aligned} + +for all :math:`l=0,\dots,n_o-1`. The resulting +Alg. [alg:fftshift_modulation] preserves the sequence of data at +the price of some extra computation. + +[alg:fftshift_modulation] + +For :math:`k=0,\dots,n_i-1` compute +:math:`\hat f_k = \hat g_{(k+k_s)} {{\mathrm{e}^{-2\pi{{{\text{i}}}} (k+k_s)l_s/n}}}`. +For :math:`l=0,\dots,n_o-1` compute +:math:`f_l = \sum_{k=0}^{n_i-1} \hat f_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}` +using PFFT. For :math:`l=0,\dots,n_o-1` compute +:math:`g_{(l+l_s)} = f_l {{\mathrm{e}^{-2\pi{{{\text{i}}}} k_sl/n}}}`. + +The special case :math:`k_s=-\frac{n_i}{2}, l_s=-\frac{n_o}{2}` +corresponds to shifting both arrays by half their length. + +For :math:`k=0,\dots,n_i-1` compute +:math:`\hat f_k = \hat g_{(k-\nicefrac{n_i}{2})} {{{\mathrm{e}}}}^{+\pi{{\text{i}}}(k-\nicefrac{n_i}{2})n_o/n}`. +For :math:`l=0,\dots,n_o-1` compute +:math:`f_l = \sum_{k=0}^{n_i-1} \hat f_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}` +using PFFT. For :math:`l=0,\dots,n_o-1` compute +:math:`g_{(l-\nicefrac{n_o}{2})} = f_l {{{\mathrm{e}}}}^{+\pi{{\text{i}}}n_i l/n}`. + +Parallel pruned FFT +------------------- + +Within PFFT we define a pruned FFT as + +.. math:: g_l = \sum_{k=0}^{n_i-1} \hat g_{k} {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=0,\dots,n_o-1. + +Formally, this is equivalent to the following regular size :math:`n` +FFT + +.. math:: f_l = \sum_{k=0}^{n-1} \hat f_{k} {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=0,\dots,n-1, + +with + +.. 
math:: + + \hat f_k := + \begin{cases} + \hat g_k, &: k=0,\dots,n_i-1, \\ + 0 &: k=n_i,\dots,n-1, + \end{cases} + +and :math:`g_l := f_l`, :math:`l=0,\dots,n_o-1`. I.e., we add +:math:`n-n_i` zeros at the end of the input array and throw away +:math:`n-n_o` entries at the end of the output array. + +The definition of pruned FFT changes for ``PFFT_SHIFTED_IN`` and +``PFFT_SHIFTED_OUT``. diff --git a/doc/fortran.rst b/doc/fortran.rst new file mode 100644 index 0000000..864aa83 --- /dev/null +++ b/doc/fortran.rst @@ -0,0 +1,8 @@ +[2]ifpackageloaded#1#2 [2]ifpackageloaded#1#2 [3]ifpackageloaded#1#2#3 + +#1 + +Fortran Interface +================= + +based on Fortran 90 diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000..2e0d8e2 --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,30 @@ +.. PFFT documentation master file, created by + sphinx-quickstart on Sun Sep 13 01:20:34 2015. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to PFFT's documentation! +================================ + +Contents: + +.. toctree:: + :maxdepth: 2 + + intro + tutorial + install + features + interface + reference + develop + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/doc/install.rst b/doc/install.rst new file mode 100644 index 0000000..7eb1370 --- /dev/null +++ b/doc/install.rst @@ -0,0 +1,151 @@ +[2]ifpackageloaded#1#2 [2]ifpackageloaded#1#2 [3]ifpackageloaded#1#2#3 + +#1 + +Installation and linking +======================== + +The install of PFFT is based on the Autotools and follows the typical +workflow + +:: + + ./configure + make + make install + +Install of the latest official FFTW release +------------------------------------------- + +PFFT depends on Release 3.3.3 of the FFTW library. For the sake of +completeness, we show the command line based install procedure in the +following. 
However, note that we provide install scripts on +`www.tu-chemnitz.de/~mpip/software.php <http://www.tu-chemnitz.de/~mpip/software.php>`__ that +simplify the install a lot. We highly recommend to use these install +scripts, since they additionally apply several performance patches and +bugfixes that have been submitted to the FFTW developers but are not yet +included in the official FFTW releases. + +:: + + wget http://www.fftw.org/fftw-3.3.3.tar.gz + tar xzvf fftw-3.3.3.tar.gz + cd fftw-3.3.3 + ./configure --enable-mpi --prefix=$HOME/local/fftw3_mpi + make + make install + +The MPI algorithms of FFTW must be built with an MPI C compiler. Add the +statement ``MPICC=$MPICCOMP`` at the end of the ``./configure`` line above if the +``configure`` script fails to determine the right MPI C compiler +``$MPICCOMP``. Similarly, the MPI Fortran compiler ``$MPIFCOMP`` is +set by ``MPIFC=$MPIFCOMP``. + +Install of the PFFT library +--------------------------- + +In the simplest case, the hardware platform and the FFTW-3.3.3 library are +recognized by the PFFT configure script automatically, so all we have to +do is + +:: + + wget http://www.tu-chemnitz.de/~mpip/software/pfft-1.0.8.tar.gz + tar xzvf pfft-1.0.8.tar.gz + cd pfft-1.0.8 + ./configure + make + make check + make install + +Hereby, the optional call ``make check`` builds the test programs. If +the FFTW-3.3.3 software library is already installed on your system but not +found by the configure script, you can provide the FFTW installation +directory ``$FFTWDIR`` to configure by + +.. code:: bash + + ./configure --with-fftw3=$FFTWDIR + +This call implies that the FFTW header files are located in +``$FFTWDIR/include`` and the FFTW library files are located in +``$FFTWDIR/lib``. 
Otherwise, one should specify the FFTW include path +``$FFTWINC`` and the FFTW library path ``$FFTWLIB`` separately by + +:: + + ./configure --with-fftw3-includedir=$FFTWINC --with-fftw3-libdir=$FFTWLIB + +At the end, this is equivalent to + +:: + + ./configure CPPFLAGS=-I$FFTWINC LDFLAGS=-L$FFTWLIB + +which is more common to experienced users of the Autotools. To install +PFFT in a user specified directory ``$PFFTINSTDIR`` call configure with +the option + +:: + + ./configure --prefix=$PFFTINSTDIR + +However, this option is mandatory whenever you do not have root +permissions on your machine, since the default install paths of +``configure`` are not accessible by standard users. The PFFT library +must be built with an MPI compiler. In Section [sec:fftw_inst] +we already described how to hand the right compilers to the +``configure`` script. Some more options are + +``--enable-float``: Produces a single-precision version +of PFFT (float) instead of the default double-precision (double); see +[sec:prec]. + +``--enable-long-double``: Produces a long-double +precision version of PFFT (long double) instead of the default +double-precision (double); see [sec:prec]. + +``--disable-fortran``: Disables inclusion of Fortran wrapper routines in +the standard PFFT libraries. + +``--disable-tests``: Disables build of test programs. + +For more details on the options of the ``configure`` script call + +:: + + ./configure --help + +How to include PFFT in your program +----------------------------------- + +All programs using PFFT should include its header file + +:: + + #include <pfft.h> + +This header includes the FFTW headers ``fftw3.h``, ``fftw3-mpi.h`` +automatically. Make sure that the compiler can find them by setting the +include flags appropriately. You must also link to the PFFT, FFTW and +FFTW-MPI libraries. On Unix, this means adding +``-lpfft -lfftw3_mpi -lfftw3 -lm`` at the end of the link command. 
For +example, to build ``pfft_test.c`` use the following compiler invocation + +:: + + mpicc pfft_test.c -I$PFFTINC -I$FFTWINC -L$PFFTLIB -L$FFTWLIB -lpfft -lfftw3_mpi -lfftw3 -lm + +Substitute ``mpicc`` by any other MPI C compiler if you like. +``$PFFTINC``, ``$FFTWINC``, ``$PFFTLIB``, and ``$FFTWLIB`` denote +the PFFT and FFTW include and library paths, respectively. If you use +the install scripts mentioned in Sect. [sec:pfft-inst], these paths will +be + +:: + + PFFTINC = $HOME/local/pfft-1.0.8/include + FFTWINC = $HOME/local/fftw-3.3.3/include + PFFTLIB = $HOME/local/pfft-1.0.8/lib + FFTWLIB = $HOME/local/fftw-3.3.3/lib + diff --git a/doc/interface.rst b/doc/interface.rst new file mode 100644 index 0000000..ff25f73 --- /dev/null +++ b/doc/interface.rst @@ -0,0 +1,126 @@ +[2]ifpackageloaded#1#2 [2]ifpackageloaded#1#2 [3]ifpackageloaded#1#2#3 + +#1 + +Interface Layers of the PFFT Library +==================================== + +We give a quick overview of the PFFT interface layers in the order of +increasing flexibility at the example of c2c-FFTs. For r2c-, c2r-, and +r2r-FFT similar interface layer specifications apply. A full reference +list of all PFFT functions is given in Chapter [chap:ref]. + +Basic Interface +--------------- + +The ``_3d`` interface is the simplest interface layer. It is suitable +for the planning of three-dimensional FFTs. 
+ +:: + + ptrdiff_t pfft_local_size_dft_3d( + const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags, + ptrdiff_t *local_ni, ptrdiff_t *local_i_start, + ptrdiff_t *local_no, ptrdiff_t *local_o_start); + void pfft_local_block_dft_3d( + const ptrdiff_t *n, MPI_Comm comm_cart, + int pid, unsigned pfft_flags, + ptrdiff_t *local_ni, ptrdiff_t *local_i_start, + ptrdiff_t *local_no, ptrdiff_t *local_o_start); + pfft_plan pfft_plan_dft_3d( + const ptrdiff_t *n, + pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart, + int sign, unsigned pfft_flags); + +Hereby, ``n``, ``local_ni``, ``local_i_start``, ``local_no``, and +``local_o_start`` are ``ptrdiff_t`` arrays of length ``3``. + +The basic interface generalizes the ``_3d`` interface to FFTs of +arbitrary dimension ``rnk_n``. + +:: + + ptrdiff_t pfft_local_size_dft( + int rnk_n, const ptrdiff_t *n, + MPI_Comm comm_cart, unsigned pfft_flags, + ptrdiff_t *local_ni, ptrdiff_t *local_i_start, + ptrdiff_t *local_no, ptrdiff_t *local_o_start); + void pfft_local_block_dft( + int rnk_n, const ptrdiff_t *n, + MPI_Comm comm_cart, int pid, unsigned pfft_flags, + ptrdiff_t *local_ni, ptrdiff_t *local_i_start, + ptrdiff_t *local_no, ptrdiff_t *local_o_start); + pfft_plan pfft_plan_dft( + int rnk_n, const ptrdiff_t *n, + pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart, + int sign, unsigned pfft_flags); + +Therefore, ``n``, ``local_ni``, ``local_i_start``, ``local_no``, and +``local_o_start`` become arrays of length ``rnk_n``. + +Advanced Interface +------------------ + +The advanced interface introduces the arrays ``ni`` and ``no`` of length +``rnk_n`` that give the pruned FFT input and output size. Furthermore, +the arrays ``iblock`` and ``oblock`` of length ``rnk_pm`` (``rnk_pm`` +being the dimension of the process mesh) serve to adjust the block size +of the input and output block decomposition. The additional parameter +``howmany`` gives the number of transforms that will be computed +simultaneously. 
+ +:: + + ptrdiff_t pfft_local_size_many_dft( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany, + const ptrdiff_t *iblock, const ptrdiff_t *oblock, + MPI_Comm comm_cart, unsigned pfft_flags, + ptrdiff_t *local_ni, ptrdiff_t *local_i_start, + ptrdiff_t *local_no, ptrdiff_t *local_o_start); + void pfft_local_block_many_dft( + int rnk_n, const ptrdiff_t *ni, const ptrdiff_t *no, + const ptrdiff_t *iblock, const ptrdiff_t *oblock, + MPI_Comm comm_cart, int pid, unsigned pfft_flags, + ptrdiff_t *local_ni, ptrdiff_t *local_i_start, + ptrdiff_t *local_no, ptrdiff_t *local_o_start); + pfft_plan pfft_plan_many_dft( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany, + const ptrdiff_t *iblock, const ptrdiff_t *oblock, + pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart, + int sign, unsigned pfft_flags); + +Preliminary: Skip Serial Transformations +---------------------------------------- + +The ``_skipped`` interface extends the ``_many`` interface by adding the +possibility to skip some of the serial FFTs. + +:: + + pfft_plan pfft_plan_many_dft_skipped( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *ni, const ptrdiff_t *no, ptrdiff_t howmany, + const ptrdiff_t *iblock, const ptrdiff_t *oblock, + const int *skip_trafos, + pfft_complex *in, pfft_complex *out, MPI_Comm comm_cart, + int sign, unsigned pfft_flags); + +Hereby, ``skip_trafos`` is an ``int`` array of length ``rnk_pm+1`` +(``rnk_pm`` being the mesh dimension of the communicator ``comm_cart``). +For ``t=0,...,rnk_pm`` set ``skip_trafos[t]=1`` if the ``t``-th serial +transformation should be computed, otherwise set ``skip_trafos[t]=0``. +Note that the local transpositions are always performed, since they are +a prerequisite for the global communication to work. At the moment it is +only possible to skip the whole serial transform along the last +``rnk_n-rnk_pm-1`` dimensions.
However, this behaviour can be realized +by a call of a ``(rnk_pm``\ 1)+-dimensional PFFT with + +:: + + for(int t=rnk_pm+1; t` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 2> nul +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\PFFT.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\PFFT.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. 
+ goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. 
+ echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
+ goto end +) + +:end diff --git a/doc/preample.tex b/doc/preample.tex new file mode 100644 index 0000000..a5d548b --- /dev/null +++ b/doc/preample.tex @@ -0,0 +1,105 @@ + +\usepackage[english]{babel} % wordbreaks +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage[fixlanguage]{babelbib} % easily change bib language + +%% Vector fonts for PDF +%% \usepackage{ae} % deprecated package, use lmodern instead +\usepackage{lmodern} %standard latex font + +%\usepackage{makeidx} % automatic index generation, required for nomencl.sty +%\usepackage{nomencl} % important symbols in a table at the beginning of document + +%% AMSMath-packages +\usepackage{amsmath} +\usepackage{amsthm} +\usepackage{amssymb} +% \usepackage{amsrefs} +% \usepackage{textcmds} +\usepackage{exscale} % Correct font scaling in formulas + +% \usepackage{subfig} +% \usepackage{graphicx} % Graphics for figures +\usepackage[svgnames, table, hyperref]{xcolor} % +\usepackage{paralist} % compact itemize, enumerate, ... 
+\usepackage{listings} % source code in LaTeX +\usepackage{multirow} % combine multiple rows in arrays +\usepackage{rotating} + +%% improvements of LaTeX enviroments +\usepackage{scrhack} % avoid warning of scrreprt when loading float package +% \usepackage{float} +% \usepackage{verbatim} +% \usepackage{array} + +% \usepackage{url} % provides \url command for bibtex +\usepackage{hyperref} % provides \url command for bibtex and links to jump within documents +\hypersetup{plainpages=false, colorlinks, linkcolor=black, citecolor=black, urlcolor=blue, +pdftitle={PFFT User Manual}, +pdfauthor={Michael Pippig}, pdfstartview={FitBH}} + +%% adjust numbering +\numberwithin{figure}{chapter} +\numberwithin{table}{chapter} +\numberwithin{equation}{chapter} + +%% activate for algorithms +\usepackage{algpseudocode} +\usepackage[chapter]{algorithm} +\usepackage{algorithmicx} +% \floatname{algorithm}{Algorithmus} % use german title for algorithms +% \numberwithin{algorithm}{chapter} + +\usepackage{xspace} +\usepackage{nicefrac} + +\usepackage{todonotes} + +%% activate for compact page layout +% \usepackage{geometry} +% \geometry{top=30.4mm, left=30.4mm, text={155mm,240mm}, headheight=10mm, headsep=5mm, includemp, marginparwidth=15.4mm} + +%% activate for headline with chapter information on every page +\usepackage{scrpage2} +\pagestyle{scrheadings} +\automark{chapter} +\clearscrheadings +\lehead{\pagemark} +\rehead{\leftmark} +\rohead{\pagemark} +\lohead{\rightmark} +\ofoot[]{} +\cfoot[]{} +\ifoot[]{} + + +\renewcommand*{\thefootnote}{\fnsymbol{footnote}} + +%% the quotchap document style redefines the \chapter and \chapter* commands to +%% create fancy chapter head pages with huge chapter numbers (possibly greyed) and +%% provides commands for adding quotations in the upper left corner of these pages. 
+% \usepackage[grey]{quotchap} + + +%% very special purpose packages +% \usepackage{faktor} % provides a symbol for factor groups +% \usepackage{slashbox} % diagonally divide an array field + + +%% experimental +% \usepackage[color]{showkeys} % show all reference keys +% \definecolor{refkey}{gray}{.75} +% \definecolor{labelkey}{gray}{0.75} +% \usepackage{epstopdf} % include .eps files with pdflatex +% \usepackage{marginnote} +%\usepackage{pgf} +%\usepackage{jkpgf} +%\usepackage{pstricks} +%% make pdf-indexfile for inverse search - needs compatible pdf viewer +% \synctex=1 +% \usepackage{pdfsync} % deprecated package, use synctex instead + +\hyphenation{equi-spaced non-equi-spaced} + +\input{shortcuts.tex} % Shortcuts for math symbols diff --git a/doc/reference.rst b/doc/reference.rst new file mode 100644 index 0000000..08a6e39 --- /dev/null +++ b/doc/reference.rst @@ -0,0 +1,1641 @@ +[2]ifpackageloaded#1#2 [2]ifpackageloaded#1#2 [3]ifpackageloaded#1#2#3 + +#1 + +PFFT Reference +============== + +Files and Data Types +-------------------- + +You must include the PFFT header file by + +:: + + #include <pfft.h> + +in the preamble of each source file that calls PFFT. This header +automatically includes ``fftw3.h`` and ``fftw3-mpi.h``. Therefore, PFFT +can use the ``fftw_complex`` data type defined in ``fftw3.h``; see the FFTW manual. Note +that ``fftw_complex`` is defined to be the C99 native complex whenever +``<complex.h>`` is included *before* ``<fftw3.h>``, ``<fftw3-mpi.h>`` and +``<pfft.h>``. Otherwise it is defined as + +:: + + typedef double fftw_complex[2]; + +For the sake of a clean namespace we define the wrapper data type +``pfft_complex`` as + +:: + + typedef fftw_complex pfft_complex; + +that can be used equivalently to ``fftw_complex``. Furthermore, we +define the wrapper functions + +:: + + void *pfft_malloc(size_t n); + double *pfft_alloc_real(size_t n); + pfft_complex *pfft_alloc_complex(size_t n); + void pfft_free(void *p); + +as substitutes for their corresponding FFTW equivalents; see the FFTW manual.
Note that +memory allocated by one of these functions must be freed with +``pfft_free`` (or its equivalent ``fftw_free``). Because of the +performance reasons given in the FFTW manual, we recommend using one of the ``pfft_`` +(or its equivalent ``fftw_``) allocation functions for all arrays +containing FFT inputs and outputs. However, PFFT will also work +(possibly slower) with any other memory allocation method. + +Different precisions are handled as in FFTW: That is, ``pfft_`` functions +and datatypes become ``pfftf_`` (single precision) or ``pfftl_`` (long +double precision) prefixed. Quadruple precision is not yet supported. +The main problem is that we do not know about a suitable MPI datatype to +represent ``__float128``. + +MPI Initialization +------------------ + +Initialization and cleanup of PFFT is done in the same way as for +FFTW-MPI; see the FFTW manual. In order to keep a clean name space, PFFT offers the +wrapper functions + +:: + + void pfft_init(void); + void pfft_cleanup(void); + +that can be used as substitutes for ``fftw_mpi_init`` and +``fftw_mpi_cleanup``, respectively. + +Using PFFT Plans +---------------- + +PFFT follows exactly the same workflow as FFTW-MPI. A plan created by +one of the functions given in Section [sec:create-plan] is executed with + +:: + + void pfft_execute(const pfft_plan plan); + +and freed with + +:: + + void pfft_destroy_plan(const pfft_plan plan); + +Note that you can *not* apply ``fftw_mpi_execute`` or ``fftw_destroy_plan`` +on PFFT plans.
+ +The new array execute functions are given by + +:: + + void pfft_execute_dft(const pfft_plan plan, pfft_complex *in, pfft_complex *out); + void pfft_execute_dft_r2c(const pfft_plan plan, double *in, pfft_complex *out); + void pfft_execute_dft_c2r(const pfft_plan plan, pfft_complex *in, double *out); + void pfft_execute_r2r(const pfft_plan plan, double *in, double *out); + +The arrays given by ``in`` and ``out`` must have the correct size and +the same alignement as the array that were used to create the plan, just +as it is the case for FFTW, see [fftw-new-array]. + +Data Distribution Functions +--------------------------- + +Complex-to-Complex FFT +~~~~~~~~~~~~~~~~~~~~~~ + +:: + + ptrdiff_t pfft_local_size_dft_3d( + const ptrdiff_t *n, MPI_Comm comm_cart, unsigned pfft_flags, + ptrdiff_t *local_ni, ptrdiff_t *local_i_start, + ptrdiff_t *local_no, ptrdiff_t *local_o_start); + ptrdiff_t pfft_local_size_dft( + int rnk_n, const ptrdiff_t *n, + MPI_Comm comm_cart, unsigned pfft_flags, + ptrdiff_t *local_ni, ptrdiff_t *local_i_start, + ptrdiff_t *local_no, ptrdiff_t *local_o_start); + ptrdiff_t pfft_local_size_many_dft( + int rnk_n, const ptrdiff_t *n, const ptrdiff_t *ni, const ptrdiff_t *no, + ptrdiff_t howmany, const ptrdiff_t *iblock, const ptrdiff_t *oblock, + MPI_Comm comm_cart, unsigned pfft_flags, + ptrdiff_t *local_ni, ptrdiff_t *local_i_start, + ptrdiff_t *local_no, ptrdiff_t *local_o_start); + +Compute the data distribution of a parallel, complex input/output +discrete Fourier transform (DFT) in two or more dimensions, returning +the number of *complex* numbers that must be allocated to hold the +parallel transform. + +Arguments: + +``rnk_n`` is the rank of the transform (typically the size of the arrays +``n``, ``ni``, ``no``) that can be any integer :math:`\ge 2`. The +``_3d`` planner corresponds to a ``rnk_n`` of 3. + +The array ``n`` of size ``rnk_n`` specifies the transform dimensions. +They can be any positive integer. 
+ +The array ``ni`` of size ``rnk_n`` specifies the input array dimensions. +They can be any positive integer with ``ni[t] <= n[t]`` for all +dimensions ``t=0,...,rnk_n-1``. For ``ni[t] alloc_local ? alloc_local_gc : alloc_local); + +Here, ``alloc_local`` gives the number of data elements that are +necessary to hold all steps of the parallel FFT, while +``alloc_local_gc`` gives the number of data elements that are necessary +to hold all steps of the ghost cell communication. Note that we took the +maximum of these both numbers as argument for ``pfft_alloc_complex``. +The code snippet for real valued arrays looks very similar. + +:: + + /* Get parameters of data distribution */ + /* alloc_local, local_no, local_o_start are given in complex units */ + /* local_ni, local_i_start are given in real units */ + alloc_local = pfft_local_size_dft_r2c_3d(n, comm_cart_2d, PFFT_TRANSPOSED_NONE, + local_ni, local_i_start, local_no, local_o_start); + + /* alloc_local_gc, local_ngc, local_gc_start are given in real units */ + alloc_local_gc = pfft_local_size_gc_3d( + local_ni, local_i_start, gc_below, gc_above, + local_ngc, local_gc_start); + + /* Allocate enough memory for FFT and ghost cells */ + double *rdata = pfft_alloc_real(alloc_local_gc > 2*alloc_local ? alloc_local_gc : 2*alloc_local); + +Note that the number of real valued data elements is given by two times +``alloc_local`` for r2c transforms, whereas the last line would change +into + +:: + + double *rdata = pfft_alloc_real(alloc_local_gc > alloc_local ? alloc_local_gc : alloc_local); + +for r2r transforms. + +Plan Creation for Complex Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following functions create ghost cell plans that operate on complex +valued arrays, i.e., + +c2c inputs, + +c2c outputs, + +r2c outputs (use flag ``PFFT_GC_C2R``), and + +c2r inputs (use flag ``PFFT_GC_R2C``). 
+ +Corresponding to the three interface layers for FFT planning, there are +the following three layers for creating a complex valued ghost cell +plan: + +:: + + pfft_gcplan pfft_plan_cgc_3d( + const ptrdiff_t *n, + const ptrdiff_t *gc_below, const ptrdiff_t *gc_above, + pfft_complex *data, MPI_Comm comm_cart, unsigned gc_flags); + pfft_gcplan pfft_plan_cgc( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *gc_below, const ptrdiff_t *gc_above, + pfft_complex *data, MPI_Comm comm_cart, unsigned gc_flags); + pfft_gcplan pfft_plan_many_cgc( + int rnk_n, const ptrdiff_t *n, + ptrdiff_t howmany, const ptrdiff_t *block, + const ptrdiff_t *gc_below, const ptrdiff_t *gc_above, + pfft_complex *data, MPI_Comm comm_cart, unsigned gc_flags); + +Hereby, ``rnk_n``, ``n``, ``howmany`` and ``comm_cart`` must be the +variables that were used for the PFFT plan creation. However, only the +case ``rnk_n==3`` is completely implemented at the moment. Remember that +``n`` is the logical FFT size just as it is the case for FFT planning. +The block size ``block`` must be equal to ``iblock`` or ``oblock`` +depending on whether the ghost cell plan works on the FFT input or +output array. Analogously, ``data`` becomes ``in`` or ``out``. Set the +number of ghost cells by ``gc_below`` and ``gc_above`` as described in +Section [sec:gc]. The flags ``gc_flags`` must be set appropriately to +the flags that were passed to the FFT planner. Table [tab:map-cgcflags] +shows the ghost cell planner flags that must be set in dependence on the +listed FFT planner flags. 
+ +[h] + ++----------------------------+-------------------------------+ +| FFT flag | ghost cell flag | ++============================+===============================+ +| ``PFFT_TRANSPOSED_NONE`` | ``PFFT_GC_TRANSPOSED_NONE`` | ++----------------------------+-------------------------------+ +| ``PFFT_TRANSPOSED_IN`` | ``PFFT_GC_TRANSPOSED`` | ++----------------------------+-------------------------------+ +| ``PFFT_TRANSPOSED_OUT`` | ``PFFT_GC_TRANSPOSED`` | ++----------------------------+-------------------------------+ + +[tab:map-cgcflags] + +In addition, we introduce the flag ``PFFT_GC_R2C`` (and its equivalent +``PFFT_GC_C2R``) to handle the complex array storage format of r2c and +c2r transforms. In fact, these two flags imply an ordinary complex +valued ghost cell communication on an array of size +``n[0] x ... x n[rnk_n-2] x (n[rnk_n-1]/2+1)``. Please note that we +wrongly assume periodic boundary conditions in this case. Therefore, you +should ignore the data elements with the last index beyond +``n[rnk_n-1]/2``. + +Plan Creation for Real Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following functions create ghost cell plans that operate on real +valued arrays, i.e., + +r2r inputs, + +r2r outputs, + +r2c inputs, and + +c2r outputs.
+ +Corresponding to the three interface layers for FFT planning, there are +the following three layers for creating a real valued ghost cell plan: + +:: + + pfft_gcplan pfft_plan_rgc_3d( + const ptrdiff_t *n, + const ptrdiff_t *gc_below, const ptrdiff_t *gc_above, + double *data, MPI_Comm comm_cart, unsigned gc_flags); + pfft_gcplan pfft_plan_rgc( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *gc_below, const ptrdiff_t *gc_above, + double *data, MPI_Comm comm_cart, unsigned gc_flags); + pfft_gcplan pfft_plan_many_rgc( + int rnk_n, const ptrdiff_t *n, + ptrdiff_t howmany, const ptrdiff_t *block, + const ptrdiff_t *gc_below, const ptrdiff_t *gc_above, + double *data, MPI_Comm comm_cart, unsigned gc_flags); + +Hereby, ``rnk_n``, ``n``, ``howmany`` and ``comm_cart`` must be the +variables that were used for the PFFT plan creation. Remember that ``n`` +is the logical FFT size just as it is the case for FFT planning. The +block size ``block`` must be equal to ``iblock`` or ``oblock`` depending +on whether the ghost cell plan works on the FFT input or output array. +Analogously, ``data`` becomes ``in`` or ``out``. Set the number of ghost +cells by ``gc_below`` and ``gc_above`` as described in +Section [sec:gc:local-size]. The flags ``gc_flags`` must be set +appropriately to the flags that were passed to the FFT planner. +Table [tab:map-rgcflags] shows the ghost cell planner flags that must be +set in dependence on the listed FFT planner flags. 
+ +[h] + ++----------------------------+-------------------------------+ +| FFT flag | ghost cell flag | ++============================+===============================+ +| ``PFFT_TRANSPOSED_NONE`` | ``PFFT_GC_TRANSPOSED_NONE`` | ++----------------------------+-------------------------------+ +| ``PFFT_TRANSPOSED_IN`` | ``PFFT_GC_TRANSPOSED`` | ++----------------------------+-------------------------------+ +| ``PFFT_TRANSPOSED_OUT`` | ``PFFT_GC_TRANSPOSED`` | ++----------------------------+-------------------------------+ +| ``PFFT_PADDED_R2C`` | ``PFFT_GC_PADDED_R2C`` | ++----------------------------+-------------------------------+ +| ``PFFT_PADDED_C2R`` | ``PFFT_GC_PADDED_C2R`` | ++----------------------------+-------------------------------+ + +[tab:map-rgcflags] + +Note that the flag ``PFFT_GC_PADDED_R2C`` (or its equivalent +``PFFT_GC_PADDED_C2R``) implies an ordinary real valued ghost cell +communication on an array of size +``n[0] x ... x n[rnk_n-2] x 2*(n[rnk_n-1]/2+1)``. In particular, the +padding elements will be handled as normal data points, i.e., you must +be aware that the numbers of ghost cells ``gc_below[rnk_n-1]`` and +``gc_above[rnk_n-1]`` include the number of padding elements. + +Unofficial Flags +~~~~~~~~~~~~~~~~ + +Ghost Cell Execution Timer +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +PFFT ghost cell plans automatically accumulate the local run times of +every call to ``pfft_exchange`` and ``pfft_reduce``. For most +applications it is sufficient to print the run time of a plan ``ths`` +averaged over all runs with + +:: + + void pfft_print_average_gctimer( + const pfft_gcplan ths, MPI_Comm comm); + +Note that for each timer the maximum time over all processes is reduced +to rank ``0`` of communicator ``comm``, i.e., a call to ``MPI_Reduce`` +is performed and the output is only printed on this process.
The +following function works in the same way but prints more verbose output + +:: + + void pfft_print_average_gctimer_adv( + const pfft_gcplan ths, MPI_Comm comm); + +To write the averaged run time of a ghost cell plan ``ths`` into a file +called ``name`` use + +:: + + void pfft_write_average_gctimer( + const pfft_gcplan ths, const char *name, MPI_Comm comm); + void pfft_write_average_gctimer_adv( + const pfft_gcplan ths, const char *name, MPI_Comm comm); + +Again, the output is only written on rank ``0`` of communicator +``comm``. + +Discard all the recorded run times with + +:: + + void pfft_reset_gctimers( + pfft_gcplan ths); + +This function is called by default at the end of every ghost cell plan +creation function. + +In order to access the run times directly a new typedef ``pfft_gctimer`` +is introduced. The following functions return a copy of the timer +corresponding to ghost cell plan ``ths`` that accumulated the time for +ghost cell exchange or ghost cell reduce, respectively: + +:: + + pfft_gctimer pfft_get_gctimer_exg( + const pfft_gcplan ths); + pfft_gctimer pfft_get_gctimer_red( + const pfft_gcplan ths); + +Note that the memory of the returned ``pfft_gctimer`` must be released +with + +:: + + void pfft_destroy_gctimer( + pfft_gctimer ths); + +as soon as the timer is not needed anymore. + +In the following we introduce some routines to perform basic operations +on timers. For all functions with a ``pfft_gctimer`` return value you +must use ``pfft_destroy_gctimer`` in order to release the allocated +memory of the timer.
Create a copy of a ghost cell timer ``orig`` with + +:: + + pfft_gctimer pfft_copy_gctimer( + const pfft_gctimer orig); + +Compute the average, local time over all runs of a timer ``ths`` with + +:: + + void pfft_average_gctimer( + pfft_gctimer ths); + +Create a new timer that contains the sum of two timers ``sum1`` and +``sum2`` with + +:: + + pfft_gctimer pfft_add_gctimers( + const pfft_gctimer sum1, const pfft_gctimer sum2); + +Create a timer that contains the maximum times of all the timers ``ths`` +from all processes belonging to communicator ``comm`` with + +:: + + pfft_gctimer pfft_reduce_max_gctimer( + const pfft_gctimer ths, MPI_Comm comm); + +Since this function calls ``MPI_Reduce``, only the first process (rank +0) of ``comm`` will get the desired data while all the other processes +have timers with undefined values. + +Note that you cannot access the elements of a timer directly, since it +is only a pointer to a ``struct``. However, PFFT offers a routine that +creates an array and copies all the entries of the timer into it + +:: + + void pfft_convert_gctimer2vec( + const pfft_gctimer ths, double *times); + +Remember to use ``free`` in order to release the allocated memory of the +returned array as soon as it is not needed anymore. The entries of +the returned array are ordered as follows: + +number of ``pfft_execute`` runs ``iter`` + +local run time of all runs + +local run time of zero padding (make room for incoming ghost cells and +init with zeros) + +local run time of the ghost cell exchange or reduce (depending on the +timer) + +The complementary function + +:: + + pfft_gctimer pfft_convert_vec2gctimer( + const double *times); + +creates a timer and fills its entries with the data from array +``times``. Thereby, the entries of ``times`` must be in the same order +as above. + +Useful Tools +------------ + +The following functions are useful tools but are not necessarily needed +to perform parallel FFTs.
+ +Initializing Complex Inputs and Checking Outputs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To fill a complex array ``data`` with reproducible, complex values you +can use one of the functions + +:: + + void pfft_init_input_complex_3d( + const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_n_start, + pfft_complex *data); + void pfft_init_input_complex( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + pfft_complex *data); + +Hereby, the arrays ``n``, ``local_n`` and ``local_n_start`` of length +``rnk_n`` (``rnk_n==3`` for ``_3d``) give the size of the FFT, the local +array size and the local array offset as computed by the array +distribution functions described in Section [sec:local-size]. The +functions + +:: + + double pfft_check_output_complex_3d( + const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_n_start, + const pfft_complex *data, MPI_Comm comm); + double pfft_check_output_complex( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + const pfft_complex *data, MPI_Comm comm); + +compute the :math:`l_1`-norm between the elements of array ``data`` and +values produced by ``pfft_init_input_complex_3d``, +``pfft_init_input_complex``. In addition, we supply the following +functions for setting all the input data to zero at once + +:: + + void pfft_clear_input_complex_3d( + const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_n_start, + pfft_complex *data); + void pfft_clear_input_complex( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + pfft_complex *data); + +Note that these functions can be combined for a quick consistency check +of the FFT. Since a forward FFT followed by a backward FFT reproduces +the inputs up to a scaling factor, the following code snippet should +give a result equal to zero up to machine precision.
+ +:: + + /* Initialize input with random numbers */ + pfft_init_input_complex_3d(n, local_ni, local_i_start, + in); + + /* execute parallel forward FFT */ + pfft_execute(plan_forw); + + /* clear the old input */ + if(in != out) + pfft_clear_input_complex_3d(n, local_ni, local_i_start, in); + + /* execute parallel backward FFT */ + pfft_execute(plan_back); + + /* Scale data */ + for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1] * local_ni[2]; l++) + in[l] /= (n[0]*n[1]*n[2]); + + /* Print error of back transformed data */ + err = pfft_check_output_complex_3d(n, local_ni, local_i_start, in, comm_cart_2d); + pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td, %td):\n", n[0], n[1], n[2]); + pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err); + +Hereby, we set all inputs equal to zero after the forward FFT in order +to be sure that all the final results are actually computed by the +backward FFT instead of being a buggy relic of the forward transform. 
+ +Initializing Real Inputs and Checking Outputs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To fill a real array ``data`` with reproducible, real values use one of +the functions + +:: + + void pfft_init_input_real_3d( + const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_n_start, + double *data); + void pfft_init_input_real( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + double *data); + +Hereby, the arrays ``n``, ``local_n`` and ``local_n_start`` give the +size of the FFT, the local array size and the local array offset as +computed by the array distribution functions described in +Section [sec:local-size]. The functions + +:: + + double pfft_check_output_real_3d( + const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_n_start, + const pfft_complex *data, MPI_Comm comm); + double pfft_check_output_real( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + const pfft_complex *data, MPI_Comm comm); + +compute the :math:`l_1`-norm between the elements of array ``data`` and +values produced by ``pfft_init_input_real_3d``, +``pfft_init_input_real``. In addition, we supply the following functions +for setting all the input data to zero at once + +:: + + void pfft_clear_input_real_3d( + const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_n_start, + double *data); + void pfft_clear_input_real( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + double *data); + +Note, that both ``pfft_init_input_real*`` functions will set all array +elements to zero where ``local_n + local_n_start >= n``. In +addition, both ``pfft_check_output_real*`` functions will ignore all the +errors resulting from these elements. Therefore, it is safe to use all +these functions for a consistency check of a r2c transform followed by a +c2r transform since all padding elements will be ignored. 
+ +Initializing r2c/c2r Inputs and Checking Outputs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The real inputs of a r2c transform can be initialized with the functions +described in Section [sec:init-data-3d-r2r]. However, generating suitable +inputs for a c2r transform requires more caution. In order to get real +valued results of a DFT the complex input coefficients need to satisfy +a radial Hermitian symmetry, i.e., +:math:`X[{{\boldsymbol{k}}}] = {X^*[-{{\boldsymbol{k}}}]}`. We use the +following trick to generate the complex input values for c2r transforms. +Assume any :math:`{{\boldsymbol{N}}}`-periodic complex valued function +:math:`f`. It can be easily shown that the values +:math:`X[{{\boldsymbol{k}}}] := \frac{1}{2}\left(f({{\boldsymbol{k}}})+f^*(-{{\boldsymbol{k}}})\right)` +satisfy the radial Hermitian symmetry. + +To fill a complex array ``data`` with reproducible, complex values that +fulfill the radial Hermitian symmetry use one of the functions + +:: + + void pfft_init_input_complex_hermitian_3d( + const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_n_start, + double *data); + void pfft_init_input_complex_hermitian( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + double *data); + +Hereby, the arrays ``n``, ``local_n`` and ``local_n_start`` give the +size of the FFT, the local array size and the local array offset as +computed by the array distribution functions described in +Section [sec:local-size]. The functions + +:: + + double pfft_check_output_complex_hermitian_3d( + const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_n_start, + const pfft_complex *data, MPI_Comm comm); + double pfft_check_output_complex_hermitian( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + const pfft_complex *data, MPI_Comm comm); + +compute the :math:`l_1`-norm between the elements of array ``data`` and +values produced by 
``pfft_init_input_complex_hermitian_3d``, +``pfft_init_input_complex_hermitian``. In addition, we supply the +following functions for setting all the input data to zero at once + +:: + + void pfft_clear_input_complex_hermitian_3d( + const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_n_start, + pfft_complex *data); + void pfft_clear_input_complex_hermitian( + int rnk_n, const ptrdiff_t *n, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + pfft_complex *data); + +Note, that these functions can also be used in order to generate complex +inputs with radial Hermitian symmetry for ordinary c2c transforms. Of +course the results of such a c2c DFT will have all imaginary parts equal +to zero up to machine precision. + +Operations on Arrays of Type ``ptrdiff_t`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following routines are shortcuts for the elementwise manipulation of +``ptrdiff_t`` valued arrays. In the following, all arrays ``vec``, +``vec1``, and ``vec2`` are of length ``d`` and type ``ptrdiff_t``. + +:: + + ptrdiff_t pfft_prod_INT( + int d, const ptrdiff_t *vec); + +Returns the product over all elements of ``vec``. + +:: + + ptrdiff_t pfft_sum_INT( + int d, const ptrdiff_t *vec); + +Returns the sum over all elements of ``vec``. + +:: + + int pfft_equal_INT( + int d, const ptrdiff_t *vec1, const ptrdiff_t *vec2); + +Returns 1 if both arrays have equal entries, 0 otherwise. + +:: + + void pfft_vcopy_INT( + int d, const ptrdiff_t *vec1, + ptrdiff_t *vec2); + +Copies the elements of ``vec1`` into ``vec2``. + +:: + + void pfft_vadd_INT( + int d, const ptrdiff_t *vec1, const ptrdiff_t *vec2, + ptrdiff_t *sum); + +Fills ``sum`` with the componentwise sum of ``vec1`` and ``vec2``. + +:: + + void pfft_vsub_INT( + int d, const ptrdiff_t *vec1, const ptrdiff_t *vec2, + ptrdiff_t *sum); + +Fills ``sum`` with the componentwise difference of ``vec1`` and +``vec2``. 
+ +Print Three-Dimensional Arrays in Parallel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use the following routine to print the elements of a block decomposed +three-dimensional (real or complex valued) array ``data`` in a nicely +formatted way. + +:: + + void pfft_apr_real_3d( + const double *data, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + const char *name, MPI_Comm comm); + void pfft_apr_complex_3d( + const pfft_complex *data, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + const char *name, MPI_Comm comm); + +Obviously, this only makes sense for arrays of moderate size. The block +decomposition is given by ``local_n``, ``local_start`` as returned by +the array distribution function described in Section [sec:local-size]. +Furthermore, some arbitrary string ``name`` can be added at the +beginning of each output - typically this will be the name of the array. +Communicator ``comm`` must be suitable to the block decomposition and is +used to synchronize the outputs over all processes. + +Generalizations for the case where the dimensions of the local arrays +are permuted are given by + +:: + + void pfft_apr_real_permuted_3d( + const double *data, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + int perm0, int perm1, int perm2, + const char *name, MPI_Comm comm); + void pfft_apr_complex_permuted_3d( + const pfft_complex *data, + const ptrdiff_t *local_n, const ptrdiff_t *local_start, + int perm0, int perm1, int perm2, + const char *name, MPI_Comm comm); + +Hereby, ``perm0``, ``perm1``, and ``perm2`` give the array’s permutation +of dimension. + +Reading Command Line Arguments +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following function offers a simple way to read command line +arguments into an array ``parameter``. + +:: + + void pfft_get_args( + int argc, char **argv, const char *name, + int neededArgs, unsigned type, + void *parameter); + +Hereby, ``argc`` and ``argv`` are the standard arguments of the ``main`` +routine. 
Furthermore, ``name``, ``neededArgs``, and ``type`` give the +name, number of entries and the type of the command line argument. +Supported types are ``PFFT_INT``, ``PFFT_PTRDIFF_T``, ``PFFT_FLOAT``, +``PFFT_DOUBLE``, and ``PFFT_UNSIGNED``, which denote the standard C type +that is used for typecasting. In addition, you can use the special type +``PFFT_SWITCH`` that is an integer type equal to one if the +corresponding command line argument is given. The array ``parameter`` +must be of sufficient size to hold ``neededArgs`` elements of the given +data type. Special attention is given + +For example, a program containing the following code snippet + +:: + + double x=0.1; + pfft_get_args(argc, argv, "-pfft_x", 1, PFFT_DOUBLE, &x); + int np[2]={2,1}; + pfft_get_args(argc, argv, "-pfft_np", 2, PFFT_INT, np); + ptrdiff_t n[3]={32,32,32}; + pfft_get_args(argc, argv, "-pfft_n", 3, PFFT_PTRDIFF_T, n); + int switch=0; + pfft_get_args(argc, argv, "-pfft_on", 0, PFFT_SWITCH, switch); + +that is executed via + +:: + + ./test -pfft_x 3.1 -pfft_np 2 3 -pfft_n 8 16 32 -pfft_on + +will read ``x=3.1``, ``np[2] = (2,3)``, ``n[3]= (8,16,32)``, and turn on +the ``switch=1``. Note the address operator ``&`` in front of ``x`` in +the second line! Furthermore, note that the initialization of all +variables with default values before the call of ``pfft_get_args`` +avoids trouble if the user does not provide all the command line +arguments. + +Parallel Substitutes for ``vfprintf``, ``fprintf``, and ``printf`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following functions are similar to the standard C functions +``vfprintf``, ``fprintf`` and ``printf`` with the exception, that only +rank ``0`` within the given communicator ``comm`` will produce output. +The intention is to avoid the flood of messages that is produced when +simple ``printf`` statements are run in parallel. 
+ +:: + + void pfft_vfprintf( + MPI_Comm comm, FILE *stream, const char *format, va_list ap); + void pfft_fprintf( + MPI_Comm comm, FILE *stream, const char *format, ...); + void pfft_printf( + MPI_Comm comm, const char *format, ...); + +Generating Periodic Cartesian Communicators +------------------------------------------- + +Based on the processes that are part of the given communicator ``comm`` +the following routine + +:: + + int pfft_create_procmesh_1d( + MPI_Comm comm, int np0, + MPI_Comm *comm_cart_1d); + +allocates and creates a one-dimensional, periodic, Cartesian +communicator ``comm_cart_1d`` of size ``np0``. Thereby, a non-zero error +code is returned whenever ``np0`` does not fit the size of ``comm``. The +memory of the generated communicator should be released with +``MPI_Comm_free`` after usage. Analogously, use + +:: + + int pfft_create_procmesh_2d( + MPI_Comm comm, int np0, int np1, + MPI_Comm *comm_cart_2d); + +in order to allocate and create two-dimensional, periodic, Cartesian +communicator ``comm_cart_2d`` of size ``np0*np1`` or + +:: + + int pfft_create_procmesh( + int rnk_np, MPI_Comm comm, const int *np, + MPI_Comm *comm_cart); + +in order to allocate and create a ``rnk_np``-dimensional, periodic, +Cartesian communicator of size ``np[0]*np[1]*...*np[rnk_np-1]``. Hereby, +``np`` is an array of length ``rnk_np``. Again, the memory of the +generated communicator should be released with ``MPI_Comm_free`` after +usage. diff --git a/doc/reference.tex b/doc/reference.tex index 77d8a74..315148b 100644 --- a/doc/reference.tex +++ b/doc/reference.tex @@ -1177,7 +1177,7 @@ \subsection{Reading Command Line Arguments} \begin{lstlisting} ./test -pfft_x 3.1 -pfft_np 2 3 -pfft_n 8 16 32 -pfft_on \end{lstlisting} -will read \code{x=3.1}, \code{np[2] = \{2,3\}}, \code{n[3]=\{8,16,32\}}, and turn on the \code{switch=1}. +will read \code{x=3.1}, \code{np[2] = (2,3)}, \code{n[3]= (8,16,32)}, and turn on the \code{switch=1}. 
Note the address operator \code{&} in front of \code{x} in the second line! Furthermore, note that the initialization of all variables with default values before the call of \code{pfft_get_args} avoids trouble if the user does not provide all the command line arguments. diff --git a/doc/shortcuts.tex b/doc/shortcuts.tex index 7d664e5..d02b976 100644 --- a/doc/shortcuts.tex +++ b/doc/shortcuts.tex @@ -6,23 +6,23 @@ % Shortcuts for math symbols. -\newcommand{\N}{\ensuremath{\mathbb{N}}} -\newcommand{\T}{\ensuremath{\mathbb{T}}} -\renewcommand{\S}{\ensuremath{\mathbb{S}}} -\newcommand{\NZ}{\ensuremath{\mathbb{N}_{0}}} -\newcommand{\Z}{\ensuremath{\mathbb{Z}}} -\newcommand{\R}{\ensuremath{\mathbb{R}}} -\newcommand{\Rp}{\ensuremath{\mathbb{R}_{+}}} -\newcommand{\Rn}{\ensuremath{\mathbb{R}^n}} -\newcommand{\Rnn}{\ensuremath{\mathbb{R}^{n \times n}}} -\newcommand{\C}{\ensuremath{\mathbb{C}}} -\newcommand{\cO}{\ensuremath{\mathcal{O}}} -\newcommand{\tT}{\ensuremath{\text{\tiny{T}}}} -\newcommand{\ti}{\ensuremath{\text{\scriptsize{i}}}} -\newcommand{\e}{{\ensuremath{\mathrm{e}}}} -\newcommand{\eim}[1]{\ensuremath{\mathrm{e}^{-2\pi{\ti} #1}}} -\newcommand{\eip}[1]{\ensuremath{\mathrm{e}^{+2\pi{\ti} #1}}} -\renewcommand{\mathbf}[1]{\ensuremath{\boldsymbol{#1}}} +\newcommand{\N}{{\mathbb{N}}} +\newcommand{\T}{{\mathbb{T}}} +\renewcommand{\S}{{\mathbb{S}}} +\newcommand{\NZ}{{\mathbb{N}_{0}}} +\newcommand{\Z}{{\mathbb{Z}}} +\newcommand{\R}{{\mathbb{R}}} +\newcommand{\Rp}{{\mathbb{R}_{+}}} +\newcommand{\Rn}{{\mathbb{R}^n}} +\newcommand{\Rnn}{{\mathbb{R}^{n \times n}}} +\newcommand{\C}{{\mathbb{C}}} +\newcommand{\cO}{{\mathcal{O}}} +\newcommand{\tT}{{\text{T}}} +\newcommand{\ti}{{\text{i}}} +\newcommand{\e}{{{\mathrm{e}}}} +\newcommand{\eim}[1]{{\mathrm{e}^{-2\pi{\ti} #1}}} +\newcommand{\eip}[1]{{\mathrm{e}^{+2\pi{\ti} #1}}} +\renewcommand{\mathbf}[1]{{\boldsymbol{#1}}} \newcommand{\ds}{\displaystyle} \newcommand{\sinc}{\textrm{sinc}} \newcommand{\dist}{\textrm{dist}} @@ -44,9 
+44,9 @@ {\raise-0.05em\hbox{\Large $#1$}}{\hbox{\large $#1$}}{#1}}} \newcommand{\bigtimes}{\BIGOP{\times}} \def\invisible#1{\textcolor{white}{#1}} -\newcommand{\Vect}[1]{\ensuremath{\mathbf{#1}}} -\newcommand{\Mat}[1]{\ensuremath{\mathbf{#1}}} -\newcommand{\Cal}[1]{\ensuremath{\mathcal{#1}}} +\newcommand{\Vect}[1]{{\mathbf{#1}}} +\newcommand{\Mat}[1]{{\mathbf{#1}}} +\newcommand{\Cal}[1]{{\mathcal{#1}}} \newcommand{\fft}{\textsf{FFT}} \newcommand{\ousetarrow}[2]{\overset{\textsf{#1}}{\underset{\textsf{#2}}{\rightarrow}}} @@ -183,7 +183,8 @@ % belowskip= -2ex} % % \newcommand{\code}[1]{\linebreak[2]{\ttfamily #1}} - \newcommand{\code}[2][\empty]{\ifthenelse{\equal{#1}{\empty}}{\lstinline!#2!}{\lstinline[#1]!#2!}} + \newcommand{\code}[1]{:code:`#1`} +% \newcommand{\code}[2][\empty]{\ifthenelse{\equal{#1}{\empty}}{\lstinline!#2!}{\lstinline[#1]!#2!}} } %% own enviroment for case differentiation diff --git a/doc/tutorial.rst b/doc/tutorial.rst new file mode 100644 index 0000000..cb65b46 --- /dev/null +++ b/doc/tutorial.rst @@ -0,0 +1,573 @@ +[2]ifpackageloaded#1#2 [2]ifpackageloaded#1#2 [3]ifpackageloaded#1#2#3 + +#1 + +Tutorial +======== + +The following chapter describes the usage of the PFFT library at the +example of a simple test file in the first section, followed by the more +advanced features of PFFT in the next sections. + +A first parallel transform - Three-dimensional FFT with two-dimensional data decomposition +------------------------------------------------------------------------------------------ + +We explain the basic steps for computing a parallel FFT with the PFFT +library at the example of the short test program given by +Listing [lst:man\ :sub:`c`\ 2c]. This test computes a three-dimensional +c2c-FFT on a two-dimensional process mesh. The source code +``manual_c2c_3d.c`` can be found in directory ``tests/`` of the +library’s source code tree. 
+ +After initializing MPI with ``MPI_Init`` and before calling any other +PFFT routine initialize the parallel FFT computations via + +:: + + void pfft_init(void); + +MPI introduces the concept of communicators to store all the topological +information of the physical process layout. PFFT requires to be called +on a process mesh that corresponds to a periodic, Cartesian +communicator. We assist the user in creating such a communicator with +the following routine + +:: + + int pfft_create_procmesh_2d( + MPI_Comm comm, int np0, int np1, + MPI_Comm *comm_cart_2d); + +This routine uses the processes within the communicator ``comm`` to +create a two-dimensional process grid of size ``np0`` x ``np1`` and +stores it into the Cartesian communicator ``comm_cart_2d``. Note that +``comm_cart_2d`` is allocated by the routine and must be freed with +``MPI_Comm_free`` after usage. The input parameter ``comm`` is a +communicator, indicating which processes will participate in the +transform. Choosing ``comm`` as ``MPI_COMM_WORLD`` implies that the FFT +is computed on all available processes. + +At the next step we need to know the data decomposition of the input and +output array, that depends on the array sizes, the process grid and the +chosen parallel algorithm. Therefore, we call + +:: + + ptrdiff_t pfft_local_size_3d( + ptrdiff_t *n, MPI_Comm comm_cart_2d, unsigned pfft_flags, + ptrdiff_t *local_ni, ptrdiff_t *local_i_start, + ptrdiff_t *local_no, ptrdiff_t *local_o_start); + +Hereby, ``n``, ``local_ni``, ``local_i_start``, ``local_no``, +``local_o_start`` are arrays of length :math:`3` that must be allocated. +The return value of this function equals the size of the local complex +array that needs to be allocated by every process. In most cases, this +coincides with the product of the local array sizes – but may be bigger, +whenever the parallel algorithm needs some extra storage. 
The input +value ``n`` gives the three-dimensional FFT size and the flag +``pfft_flags`` serves to adjust some details of the parallel execution. +For the sake of simplicity, we restrict ourselves to the case +``pfft_flags=PFFT_TRANSPOSED_NONE`` for a while and explain the more +sophisticated flags at a later point. The output arrays ``local_ni`` and +``local_i_start`` give the size and the offset of the local input array +that result from the parallel block distribution of the global input +array, i.e., every process owns the input data ``in[k[0],k[1],k[2]]`` +with ``local_i_start[t] <= k[t] < local_i_start[t] + local_ni[t]`` +for ``t=0,1,2``. Analogously, the output +parameters ``local_no`` and ``local_o_start`` contain the size and the +offset of the local output array. + +Afterward, the input and output arrays must be allocated. Hereby, + +:: + + pfft_complex* pfft_alloc_complex(size_t size); + +is a simple wrapper of ``fftw_alloc_complex``, which in turn allocates +the memory via ``fftw_malloc`` to ensure proper alignment for SIMD. Have +a look at the FFTW user manual for more details on SIMD memory +alignment and ``fftw_malloc``. Nevertheless, you can also use any other +dynamic memory allocation. + +The planning of a single three-dimensional parallel FFT of size ``n[0]`` +x ``n[1]`` x ``n[2]`` is done by the function + +:: + + pfft_plan pfft_plan_dft_3d( + ptrdiff_t *n, pfft_complex *in, pfft_complex *out, + MPI_Comm comm_cart_2d, int sign, unsigned pfft_flags); + +We provide the address of the input and output array by the pointers +``in`` and ``out``, respectively. An inplace transform is assumed if +these pointers are equal. The integer ``sign`` gives the sign in the +exponential of the FFT. Possible values are ``PFFT_FORWARD`` +(:math:`-1`) and ``PFFT_BACKWARD`` (:math:`+1`). Flags passed to the +planner via ``pfft_flags`` must coincide with the flags that were +passed to ``pfft_local_size_3d``. 
Otherwise the data layout of the +parallel execution may not match calculated local array sizes. As return +value we get a PFFT plan, some structure that stores all the information +needed to perform a parallel FFT. + +Once the plan is generated, we are allowed to fill the input array +``in``. Note, that per default the planning step ``pfft_plan_dft_3d`` +will overwrite input array ``in``. Therefore, you should not write any +sensitive data into ``in`` until the plan was generated. For simplicity, +our test program makes use of the library function + +:: + + void pfft_init_input_complex_3d( + ptrdiff_t *n, ptrdiff_t *local_ni, ptrdiff_t *local_i_start, + pfft_complex *in); + +to fill the input array with some numbers. Alternatively, one can fill +the array with a function ``func`` of choice and the following loop that +takes account of the parallel data layout + +:: + + ptrdiff_t m=0; + for(ptrdiff_t k0=0; k0 < local_ni[0]; k0++) + for(ptrdiff_t k1=0; k1 < local_ni[1]; k1++) + for(ptrdiff_t k2=0; k2 < local_ni[2]; k2++) + in[m++] = func(k0 + local_i_start[0], + k1 + local_i_start[1], + k2 + local_i_start[2]); + +The parallel FFT is computed when we execute the generated plan via + +:: + + void pfft_execute(const pfft_plan plan); + +Now, the results can be read from ``out`` with an analogous +three-dimensional loop. If we do not want to execute another parallel +FFT of the same type, we free the allocated memory of the plan with + +:: + + void pfft_destroy_plan(pfft_plan plan); + +Additionally, we use + +:: + + int MPI_Comm_free(MPI_Comm *comm); + +to free the communicator allocated by ``pfft_create_procmesh_2d`` and + +:: + + void pfft_free(void *ptr); + +to free memory allocated by ``pfft_alloc_complex``. 
Finally, we exit MPI +via + +:: + + int MPI_Finalize(void); + +Porting FFTW-MPI based code to PFFT +----------------------------------- + +We illustrate the close connection between FFTW-MPI and PFFT at a +three-dimensional MPI example analogous to the example given in the FFTW +manual . + +Exactly the same task can be performed with PFFT as given in +Listing [lst:pfft\ :sub:`3`\ don1d]. + +:: + + #include + + int main(int argc, char **argv) + { + const ptrdiff_t n[3] = {..., ..., ...}; + pfft_plan plan; + pfft_complex *data; + ptrdiff_t alloc_local, local_ni[3], local_i_start[3], local_no[3], local_o_start[3], i, j, k; + unsigned pfft_flags = 0; + + MPI_Init(&argc, &argv); + pfft_init(); + + /* get local data size and allocate */ + alloc_local = pfft_local_size_dft_3d(n, MPI_COMM_WORLD, pfft_flags, + local_ni, local_i_start, + local_no, local_o_start); + data = pfft_alloc_complex(alloc_local); + + /* create plan for in-place forward DFT */ + plan = pfft_plan_dft_3d(n, data, data, MPI_COMM_WORLD, + PFFT_FORWARD, PFFT_ESTIMATE); + + /* initialize data to some function my_function(x,y,z) */ + for (i = 0; i < local_n[0]; ++i) + for (j = 0; j < n[1]; ++j) + for (k = 0; k < n[2]; ++k) + data[i*n[1]*n[2] + j*n[2] + k] = my_function(local_i_start[0] + i, j, k); + + /* compute transforms, in-place, as many times as desired */ + pfft_execute(plan); + + pfft_destroy_plan(plan); + + MPI_Finalize(); + } + +substitute ``fftw3-mpi.h`` by ``pfft.h`` + +substitute all prefixes ``fftw_`` and ``fftw_mpi_`` by ``pfft_`` + +substitute all prefixes ``FFTW_`` by ``PFFT_`` + +the integers ``N``, ``local_n0``, ``local_0_start`` become arrays of +length 3 + +``dft_`` in ``pfft_local_size_dft_3d`` + +``pfft_local_size_dft_3d`` has additional input ``pfft_flags`` and +additional outputs ``local_no``, ``local_o_start`` + +The loop that inits ``data`` becomes splitted along all three +dimensions. 
We could also use + +First, All prefixes ``fftw_`` are substituted by ``pfft_`` + +Now, the changes in order to use a two-dimensional process mesh are +marginal as can be seen in Listing [lst:pfft\ :sub:`3`\ don2d]. + +:: + + #include + + int main(int argc, char **argv) + { + const ptrdiff_t n[3] = {..., ..., ...}; + (red@const int np0 = ..., np1 = ...;@*) + pfft_plan plan; + pfft_complex *data; + ptrdiff_t alloc_local, local_ni[3], local_i_start[3], local_no[3], local_o_start[3], i, j, k; + unsigned pfft_flags = 0; + (red@MPI_Comm comm_cart_2d;@*) + + MPI_Init(&argc, &argv); + pfft_init(); + + (red@/* create two-dimensional process grid of size np0 x np1 */@*) + (red@pfft_create_procmesh_2d(MPI_COMM_WORLD, np0, np1,@*) + (red@&comm_cart_2d);@*) + + /* get local data size and allocate */ + alloc_local = pfft_local_size_dft_3d(n, (red@comm_cart_2d@*), pfft_flags, + local_ni, local_i_start, + local_no, local_o_start); + data = pfft_alloc_complex(alloc_local); + + /* create plan for in-place forward DFT */ + plan = pfft_plan_dft_3d(n, data, data, MPI_COMM_WORLD, + PFFT_FORWARD, PFFT_ESTIMATE); + + /* initialize data to some function my_function(x,y,z) */ + for (i = 0; i < local_n[0]; ++i) + for (j = 0; j < (red@local_n[1]@*); ++j) + for (k = 0; k < (red@local_n[2]@*); ++k) + data[i*(red@local_n[1]*local_n[2]@*) + j*(red@local_n[2]@*) + k] = + my_function(local_i_start[0] + i, + (red@local_i_start[1] +@*) j, + (red@local_i_start[2] +@*) k); + + /* compute transforms, in-place, as many times as desired */ + pfft_execute(plan); + + pfft_destroy_plan(plan); + + MPI_Finalize(); + } + +Errorcode for communicator creation +----------------------------------- + +As we have seen the function + +:: + + int pfft_create_procmesh_2d( + MPI_Comm comm, int np0, int np1, + MPI_Comm *comm_cart_2d); + +creates a two-dimensional, periodic, Cartesian communicator. The ``int`` +return value (not used in Listing [lst:man\ :sub:`c`\ 2c]) is the +forwarded error code of ``MPI_Cart_create``. 
It is equal to zero if the +communicator was created successfully. The most common error is that the +number of processes within the input communicator ``comm`` does not fit +``np0 x np1``. In this case the Cartesian communicator is not generated +and the return value is unequal to zero. Therefore, a typical sanity +check might look like + +:: + + /* Create two-dimensional process grid of size np[0] x np[1], + if possible */ + if( pfft_create_procmesh_2d(MPI_COMM_WORLD, np[0], np[1], + &comm_cart_2d) ) + { + pfft_fprintf(MPI_COMM_WORLD, stderr, + "Error: This test file only works with %d processes.\n", + np[0]*np[1]); + MPI_Finalize(); + return 1; + } + +Hereby, we use the PFFT library function + +:: + + void pfft_fprintf( + MPI_Comm comm, FILE *stream, const char *format, ...); + +to print the error message. This function is similar to the standard C +function ``fprintf`` with the exception, that only the process with MPI +rank :math:`0` within the given communicator ``comm`` will produce some +output; see Section [sec:fprintf] for details. + +Inplace transforms +------------------ + +Similar to FFTW, PFFT is able to compute parallel FFTs completely in +place, which means that beside some constant buffers, no second data +array is necessary. Especially, the global data communication can be +performed in place. As far as we know, there is no other parallel FFT +library beside FFTW and PFFT that supports this feature. This feature is +enabled as soon as the pointer to the output array ``out`` is equal to +the pointer to the input array ``in``. E.g., in +Listing [lst:man\ :sub:`c`\ 2c] we would call + +:: + + /* Plan parallel forward FFT */ + plan = pfft_plan_dft_3d(n, in, in, comm_cart_2d, + PFFT_FORWARD, PFFT_TRANSPOSED_NONE); + +Higher dimensional data decomposition +------------------------------------- + +The test program given in Listing [lst:man\ :sub:`c`\ 2c] used a +two-dimensional data decomposition of a three-dimensional data set. 
+Moreover, PFFT supports the computation of any :math:`d`-dimensional FFT +with :math:`r`-dimensional data decomposition as long as +:math:`r\le d-1`. For example, one can use a one-dimensional data +decomposition for any two- or higher-dimensional data set, while the +data set must be at least four-dimensional to fit to a three-dimensional +data decomposition. The case :math:`r=d` is not supported efficiently, +since during the parallel computations there is always at least one +dimension that remains local, i.e., one dimension stays non-decomposed. +The only exception from this rule is the case :math:`d=r=3` that is +supported by PFFT in a special way, see Section [sec:3don3d] for +details. + +The dimensionality of the data decomposition is given by the dimension +of the Cartesian communicator that goes into the PFFT planning routines. +Therefore, we present a generalization of the communicator creation function + +:: + + int pfft_create_procmesh( + int rnk_np, MPI_Comm comm, const int *np, + MPI_Comm *comm_cart); + +Hereby, the array ``np`` of length ``rnk_np`` gives the size of the +Cartesian communicator ``comm_cart``. + +Parallel data decomposition +--------------------------- + +In the following, we use the notation :math:`\frac{n}{P}` to symbolize +that an array of length :math:`n` is broken into disjoint blocks and +distributed on :math:`P` MPI processes. Hereby, the data is distributed +in compliance to the FFTW-MPI data decomposition, i.e., the first +``n/block`` (rounded down) processes get a contiguous chunk of ``block`` +elements, the next process gets the remaining ``n - block * (n/block)`` +data elements, and all remaining processes get nothing. Thereby, the +block size ``block`` defaults to ``n/P`` (rounded down) but can also be +user defined. 
+ +Non-transposed and transposed data layout +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In the following, we use the notation :math:`\frac{n}{P}` to symbolize +that an array of length :math:`n` is distributed on :math:`P` MPI +processes. The standard PFFT data decomposition of :math:`h` interleaved +:math:`d`-dimensional arrays of equal size +:math:`n_0 \times n_1\times \dots \times n_{d-1}` on a +:math:`r`-dimensional process mesh of size +:math:`P_0\times \dots \times P_{r-1}` is given by the blocks + +.. math:: \frac{n_0}{P_0} \times \frac{n_1}{P_1} \times \dots \times \frac{n_{r-1}}{P_{r-1}} \times n_r \times n_{r+1} \times \dots \times n_{d-1} \times h. + +A PFFT created with planning flag ``PFFT_TRANSPOSED_NONE`` requires the +inputs to be decomposed in this standard way and produces outputs that +are decomposed in the same way. + +PFFT can save half of the global communication amount, if the data +reordering to standard decomposition is omitted. The transposed data +decomposition is given by + +.. math:: \frac{n_1}{P_0} \times \frac{n_2}{P_1} \times \dots \times \frac{n_{r}}{P_{r-1}} \times n_0 \times n_{r+1} \times \dots \times n_{d-1} \times h + +A PFFT plan created with planning flag ``PFFT_TRANSPOSED_OUT`` produces +outputs with transposed data decomposition. Analogously, a PFFT plan +created with planning flag ``PFFT_TRANSPOSED_IN`` requires its inputs to +be decomposed in the transposed way. Typically, one creates a forward +plan with ``PFFT_TRANSPOSED_OUT`` and a backward plan with planning flag +``PFFT_TRANSPOSED_IN``. + +Note that the flags ``PFFT_TRANSPOSED_OUT`` and ``PFFT_TRANSPOSED_IN`` +must be passed to the array distribution function (see +Section [sec:local-size]) *as well as* to the planner (see +Section [sec:create-plan]). 
+ +Three-dimensional FFTs with three-dimensional data decomposition +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Many applications work with three-dimensional block decompositions of +three-dimensional arrays. PFFT supports decompositions of the kind + +.. math:: \frac{n_0}{P_0} \times \frac{n_1}{P_1} \times \frac{n_2}{P_2} \times h. + +However, PFFT applies a parallel algorithm that needs at least one +non-distributed transform dimension (we do not transform along +:math:`h`). Therefore, we split the number of processes along the last +dimension into two factors :math:`P_2=Q_0Q_1`, remap the data to the +two-dimensional decomposition + +.. math:: \frac{n_0}{P_0Q_0} \times \frac{n_1}{P_1Q_1} \times n_2 \times h, + +and compute the parallel FFT with this two-dimensional decomposition. +Note that the 3d to 2d remap implies some very special restrictions on +the block sizes for :math:`n_0` and :math:`n_1`, i.e., the blocks must +be divisible by :math:`Q_0` and :math:`Q_1`. More precisely, the default +blocks of the 2d-decomposition are given by ``n0/(P0*Q0)`` and +``n1/(P1*Q1)`` (both divisions rounded down). This implies that the +default blocks of the 3d-decomposition must be ``n0/(P0*Q0) * Q0``, +``n1/(P1*Q1) * Q1``, and ``n2/(Q0*Q1)`` (all divisions rounded down). + +Planning effort +--------------- + +Pass one of the following flags + +``PFFT_ESTIMATE``, + +``PFFT_MEASURE``, + +``PFFT_PATIENT``, or, + +``PFFT_EXHAUSTIVE`` + +to the PFFT planner in order to plan all internal FFTW plans with +``FFTW_ESTIMATE``, ``FFTW_MEASURE``, ``FFTW_PATIENT``, or +``FFTW_EXHAUSTIVE``, respectively. The default value is ``PFFT_MEASURE``. + +PFFT uses FFTW plans for parallel array transposition and the serial +transforms. In fact, every serial transform is a combination of strided +lower-dimensional FFTs and a serial array transposition (necessary to +prepare the global transposition) which can be done by a single FFTW +plan. 
However, it turns out that FFTW sometimes performs better if the +serial transposition and the strided FFTs are executed separately. +Therefore, PFFT introduces the flag ``PFFT_TUNE`` that enables extensive +run time tests in order to find the optimal sequence of serial strided +FFT and serial transposition for every serial transform. These tests are +disabled by default, which corresponds to the flag ``PFFT_NO_TUNE``. + +Preserving input data +--------------------- + +The following flags + +``PFFT_PRESERVE_INPUT``, + +``PFFT_DESTROY_INPUT``, and + +``PFFT_BUFFERED_INPLACE`` + +only take effect for out-of-place transforms. The first one behaves +analogously to the FFTW flag ``FFTW_PRESERVE_INPUT`` and ensures that +the input values are not overwritten. In fact, this flag implies that +only the first serial transform is executed out-of-place and all +successive steps are performed in-place on the output array. In +compliance with FFTW, this is the default behaviour for out-of-place +plans. + +The second flag behaves analogously to the FFTW flag +``FFTW_DESTROY_INPUT`` and tells the planner that the input array can be +used as a scratch array. This may give some speedup for out-of-place +plans, because all the intermediate transforms and transposition steps +can be performed out-of-place. + +Finally, the flag ``PFFT_BUFFERED_INPLACE`` can be used for out-of-place +plans that store their inputs and outputs in the same array, i.e., array +``out`` is used for intermediate out-of-place transforms and +transpositions but the PFFT inputs and outputs are stored in array +``in``. + +FFTs with shifted index sets +---------------------------- + +``PFFT_SHIFTED_IN`` + +``PFFT_SHIFTED_OUT`` + +Pruned FFT and Shifted Index Sets +--------------------------------- + +Pruned FFT +~~~~~~~~~~ + +Pruned r2r- and c2c-FFTs are defined as + +.. 
math:: g_l = \sum_{k=0}^{n_i-1} \hat g_k {{\mathrm{e}^{-2\pi{{{\text{i}}}} kl/n}}}, \quad l=0,\dots,n_o-1, + +where :math:`n_i\le n` and :math:`n_o\le n`. + +Shifted Index Sets +~~~~~~~~~~~~~~~~~~ + +For :math:`N\in 2{{\mathbb{N}}}` we define the FFT with shifted inputs + +For :math:`K,L,N\in 2{{\mathbb{N}}}`, :math:`L<N`, we define the pruned FFT with shifted inputs and outputs. + +Include the same ``<pfft.h>`` header file. + +Replace all lowercase instances of ‘``pfft_``’ with ‘``pfftf_``’ or +‘``pfftl_``’ for single or long-double precision, respectively. +(``pfft_complex`` becomes ``pfftf_complex``, ``pfft_execute`` becomes +``pfftf_execute``, etcetera.) + +Uppercase names, i.e. names beginning with ‘``PFFT_``’, remain the same. + +Replace ``double`` with ``float`` or ``long double`` for subroutine +parameters. + +Ghost cell communication +------------------------ + +Fortran interface +----------------- +