Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions bio_bits/make_pcoa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
'''
Usage: make_pcoa.py <fasta> [--map <mapfile>] [--outdir <DIR>] [--coord <coordfile>]

Options:
--outdir=<DIR>,-o=<DIR> Directory to put html file in. [Default: pcoa]
--map=<mapfile>,-m=<mapfile> TSV file which maps FASTA IDs to metadata. If not supplied one is generated using the FASTA IDs only.
--coord=<coordfile>,-c=<coordfile> Coordinate file including distance matrix, defined by Qiime pipeline. Will over-ride the information in

Help:
After running, open the resulting index.html file in your browser. i.e.:
$ make_pcoa aln.fasta --outdir pcoa
$ firefox pcoa/index.html
'''

from __future__ import print_function

#options in docopt are special, and need = if using them
''' also create static png files.
allow "color-by" parameters. '''
from docopt import docopt
from schema import Schema, Use, Optional
import os
import sh
try:
from skbio import Alignment
from skbio.stats.ordination import PCoA
import emperor #!not used, but emperor must be installed to run `make_emperor.py`
run_emperor = sh.Command('make_emperor.py')
except ImportError:
print("make_pcoa requires emperor and scikit-bio!\nExecute `pip install emperor` to use.")




def make_coordinates(fasta_filename):
alignment = Alignment.read(fasta_filename)
distance_matrix = alignment.distances()
pcoa = PCoA(distance_matrix)
scores = pcoa.scores()
return scores

def write_coordiates(fasta_filename):
outname = '%s.coord' % fasta_filename
assert not os.path.exists(outname), "Coordinate file %s exists! Please remove or run again with --coord parameter." % outname
print("Generating Coordinate file %s from fasta file %s" % (outname, fasta_filename))
make_coordinates(fasta_filename).write(outname)
return outname

def make_emperor(fasta_fn, outdir, mapfile, coordfile):
# do try/except for sh call to make_emperor
mapfile = mapfile or make_simple_mapping(fasta_fn)
coordinate_file = coordfile or write_coordiates(fasta_fn)
return run_emperor(i=coordinate_file, m=mapfile, o=outdir)

def make_simple_mapping(fasta_fn):
ids = map(lambda x: x[1:], filter(lambda x: x.startswith('>'), open(fasta_fn)))
header = '#SampleID\n'
mapfile_fn = '%s.map' % fasta_fn
assert not os.path.exists(mapfile_fn), "Mapping file %s exists! Please remove, or run again with --map parameter." % mapfile_fn
print("Auto-generating map file %s from fasta file %s" % (mapfile_fn, fasta_fn))
with open(mapfile_fn, 'w') as mapfile:
mapfile.write(header)
mapfile.writelines(ids)
return mapfile_fn

#NOTE: Currently unused
'''
def make_undescore_metadata_mapping(fasta):
import re
reg = re.compile(r'^[^_]+_([^_]+)_')
with open('%s.map' % fasta, 'w') as mapfile:
ids = map(X[1:], filter(X[0] == '>', open(fasta)))
#groups = groupby(ids, lambda x: x[:x.find('_')])
groups = groupby(ids, lambda x: reg.match(x).groups()[0])
header = '#Group\tSampleID\n'
mapfile.write(header)
for k, group in groups:
mapfile.writelines(map(('%s\t' % k + '{0}').format, group))
return mapfile.name
'''
def main():
scheme = Schema(
{ '<fasta>' : os.path.isfile,
Optional('--map') : Use(lambda x: x or None),
Optional('--coord') : Use(lambda x: x or None),
'--outdir' : str
})
raw_args = docopt(__doc__, version='Version 1.0')
args = scheme.validate(raw_args)
make_emperor( args['<fasta>'], args['--outdir'], args['--map'], args['--coord'])

if __name__ == '__main__': main()
1 change: 1 addition & 0 deletions docs/scripts/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ Scripts
degen_regions
plot_muts
fasta
make_pcoa
58 changes: 58 additions & 0 deletions docs/scripts/make_pcoa.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
make_pcoa
=======

make_pcoa is used to build an interactive 3D plot for a given fasta alignment file built from
a Principal Coordinate Analysis (PCoA). The file produced can be opened in a browser for interactive viewing.

makce_pcoa uses the scikit-bio package to compute the PCoA and the emperor program to create the plot.

Usage
+++++++++++++++++++++++++++


Usage:
make_pcoa.py <fasta> [--map <mapfile>] [--outdir <DIR>] [--coord <coordfile>]

Options:
--outdir=<DIR>,-o=<DIR> Directory to put html file in. [Default: emperor]
--map=<mapfile>,-m=<mapfile> TSV file which maps FASTA IDs to metadata. If not supplied one is generated using the FASTA IDs only.
--coord=<coordfile>,-c=<coordfile> Coordinate file including distance matrix, defined by Qiime pipeline

Examples:

.. code-block:: bash

$> make_pcoa tests/testinput/aln1.fasta --map mymap.map --outdir results

$> make_pcoa tests/testinput/aln1.fasta --outdir results

$> make_pcoa tests/testinput/aln1.fasta --map mymap.map --coord mycoord.coord


Produces a folder results (named "pcoa" if --outdir is not provided) condaining an index.html file. The plot can be viewed by opening it in the browser:

.. code-block:: bash

$> firefox pcoa/index.html



Mapping File
+++++++++++++

Mapping files are used to customize the plot (e.g., color grouping) by defining categories for the provided sequences. Mapping files are simple TSV-seperated files which
map FASTA IDs (from the input file e.g. aln1.fasta) to arbitrary categories; i.e. geographic region of sequence, sequencing platform, etc.

Information about mapping files can be found here: `http://qiime.org/documentation/file_formats.html#metadata-mapping-files`_
An example is located here: `http://qiime.org/_static/Examples/File_Formats/Example_Mapping_File.txt`_

In general, a mapping file is suggested to assist in interpreting the data, but if one is not provided, make_pcoa will create one automatically. This auto-generated mapping file will have no categories besides FASTA ID.


Coordinate File
+++++++++++++++

Coordinate files include the data generated by PCoA. In general, make_pcoa should create this file automatically; but the --coord argument can be
supplied to avoid re-creating a new coordinate file each time (if, for example, a new mapping file is provided with the same alignment).

NOTE: If a coordinate file is provided, the fasta file is ignored: the coordinate file provides the information to create the plot (unless it is used to auto-generate the mapping file, which will effect metadata only).
2 changes: 2 additions & 0 deletions pcoa-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
emperor
scikit-bio
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
'degen = bio_bits.degen:main',
'plot_muts = bio_bits.plot_muts:main',
'fasta = bio_bits.fasta:main',
'make_pcoa = bio_pieces.make_pcoa:main',
#'sequence_concat = bio_bits.sequence_concat:main',
#'sequence_files_concat = bio_bits.sequence_files_concat:main',
#'sequence_split = bio_bits_old.sequence_split:main',
Expand Down
17 changes: 17 additions & 0 deletions tests/make_pcoa.robot.dontrun.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
*** Settings ***
Library Process
Library OperatingSystem
Library Collections
Suite Teardown Terminate All Processes

*** Variables ***
${fasta} = tests/testinput/short.aln1.fasta

*** Test Cases ***
TestMakePCAReturnCodeIsZero
${process_result} = Run Process make_pca ${fasta}
Log To Console ${process_result.stdout}
Log To Console ${process_result.stderr}
Should Be Equal As Integers ${process_result.rc} 0
# Check output
Should Not Contain ${process_result.stdout} Error
Loading