diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 94301b0..e008874 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,14 @@ Changelog =========== +2.4.2 +------ +* Updated `setup.py` dependencies. + +* `scipy` 1.15.0 has deprecated and removed `derivative` function with no direct replacement. For now, function has been imported into the project under `phydmslib/scipy_derivative.py`. + +* Updated `phydms_tutorial.Rmd` to python3 syntax. + 2.4.1 ----- * Unpin `biopython` now that `pyvolve` is updated. diff --git a/phydmslib/_metadata.py b/phydmslib/_metadata.py index 2b87b3a..9aa666b 100644 --- a/phydmslib/_metadata.py +++ b/phydmslib/_metadata.py @@ -1,4 +1,4 @@ -__version__ = '2.4.1' +__version__ = '2.4.2' __author__ = 'the Bloom lab (see https://github.com/jbloomlab/phydms/contributors)' __url__ = 'http://jbloomlab.github.io/phydms' __author_email__ = 'jbloom@fredhutch.org' diff --git a/phydmslib/file_io.py b/phydmslib/file_io.py index 4bfe94b..919e738 100644 --- a/phydmslib/file_io.py +++ b/phydmslib/file_io.py @@ -219,7 +219,7 @@ def readPrefs(prefsfile, minpref=0, avgprefs=False, randprefs=False, prefs[r] = {} for aa in df.columns: if aa != 'site': - prefs[r][aa] = float(rdf[aa]) + prefs[r][aa] = float(rdf[aa].iloc[0]) else: # try reading as dms_tools format prefs = phydmslib.file_io.readPrefs_dms_tools_format(prefsfile)[2] diff --git a/phydmslib/models.py b/phydmslib/models.py index 4fcda2b..5c85eb8 100644 --- a/phydmslib/models.py +++ b/phydmslib/models.py @@ -26,6 +26,7 @@ INDEX_TO_NT, CODON_NT_MUT, CODON_NT_COUNT, CODON_TO_AA, INDEX_TO_AA, AA_TO_INDEX, CODON_NT) +from phydmslib.scipy_derivative import derivative warnings.simplefilter('always') warnings.simplefilter('ignore', ImportWarning) @@ -571,7 +572,7 @@ def M(self, t, tips=None, gaps=None): * expD).swapaxes(1, 0), broadcastGetCols(self.Ainv, tips)) - if gaps is not None: + if gaps is not None and gaps.size > 0: M[gaps] = numpy.ones(N_CODON, dtype='float') # if M.min() < -0.01: # warnings.warn("Large negative value in M(t) being set to 0. " @@ -598,7 +599,7 @@ def dM(self, t, param, Mt, tips=None, gaps=None): else: dM_param = broadcastMatrixVectorMultiply(self.Prxy, Mt, alpha=alpha) - if gaps is not None: + if gaps is not None and gaps.size > 0: dM_param[gaps] = numpy.zeros(N_CODON, dtype='float') return dM_param @@ -663,7 +664,7 @@ def dM(self, t, param, Mt, tips=None, gaps=None): for j in range(paramlength): dM_param[j] = (broadcastMatrixVectorMultiply(self.A, broadcastGetCols(broadcastMatrixMultiply(self.B[param][j] * V, self.Ainv), tips))) # noqa: E501 - if gaps is not None: + if gaps is not None and gaps.size > 0: if not paramisvec: dM_param[gaps] = numpy.zeros(N_CODON, dtype='float') else: @@ -1338,10 +1339,10 @@ def _update_phi(self): _checkParam('phi', self.phi, self.PARAMLIMITS, self.PARAMTYPES) self._eta_from_phi() dbeta = 1.0e-3 - self.dphi_dbeta = scipy.misc.derivative(self._compute_empirical_phi, + self.dphi_dbeta = derivative(self._compute_empirical_phi, self.beta, dx=dbeta, n=1, order=5) - dphi_dbeta_halfdx = scipy.misc.derivative(self._compute_empirical_phi, + dphi_dbeta_halfdx = derivative(self._compute_empirical_phi, self.beta, dx=dbeta / 2, n=1, order=5) assert numpy.allclose(self.dphi_dbeta, dphi_dbeta_halfdx, atol=1e-5, @@ -1793,7 +1794,7 @@ def M(self, t, tips=None, gaps=None): newM = numpy.zeros((len(tips), N_CODON)) for i in range(len(tips)): newM[i] = (M[0][:, tips[i]]) - if gaps is not None: + if gaps is not None and gaps.size > 0: newM[gaps] = numpy.ones(N_CODON, dtype='float') return newM @@ -1820,7 +1821,7 @@ def dM(self, t, param, Mt, tips=None, gaps=None): dM_param = (broadcastMatrixVectorMultiply( numpy.tile(self.Pxy[0], (self.nsites, 1, 1)), Mt, alpha=alpha)) - if gaps is not None: + if gaps is not None and gaps.size > 0: dM_param[gaps] = numpy.zeros(N_CODON, dtype='float') return dM_param @@ -1862,7 +1863,7 @@ def dM(self, t, param, Mt, tips=None, gaps=None): newdM_param = numpy.zeros((len(tips), N_CODON)) for i in range(len(tips)): newdM_param[i] = (dM_param[0][:, tips[i]]) - if gaps is not None: + if gaps is not None and gaps.size > 0: newdM_param[gaps] = numpy.zeros(N_CODON, dtype='float') return newdM_param @@ -2188,11 +2189,10 @@ def f_beta(beta): for (param, f) in [('alpha_lambda', f_alpha), ('beta_lambda', f_beta)]: pvalue = getattr(self, param) - dparam = scipy.misc.derivative(f, pvalue, dx, n=1, order=5) + dparam = derivative(f, pvalue, dx, n=1, order=5) assert dparam.shape == (self.ncats,) for stepchange in [0.5, 2]: # make sure robust to step size - dparam2 = scipy.misc.derivative(f, pvalue, stepchange * dx, - n=1, order=5) + dparam2 = derivative(f, pvalue, stepchange * dx, n=1, order=5) assert (numpy.allclose(dparam, dparam2, atol=1e-5, rtol=1e-4)), ( "Numerical derivative of {0} at {1} differs for " diff --git a/phydmslib/scipy_derivative.py b/phydmslib/scipy_derivative.py new file mode 100644 index 0000000..864a08e --- /dev/null +++ b/phydmslib/scipy_derivative.py @@ -0,0 +1,194 @@ +""" scipy derivative functions + +Imported from scipy v1.11.0 -- has been deprecated and removed from scipy. +""" + +from numpy import arange, newaxis, hstack, prod, array + + +def derivative(func, x0, dx=1.0, n=1, args=(), order=3): + """ + Find the nth derivative of a function at a point. + + Given a function, use a central difference formula with spacing `dx` to + compute the nth derivative at `x0`. + + .. deprecated:: 1.10.0 + `derivative` has been deprecated from `scipy.misc.derivative` + in SciPy 1.10.0 and it will be completely removed in SciPy 1.12.0. + You may consider using + findiff: https://github.com/maroba/findiff or + numdifftools: https://github.com/pbrod/numdifftools + + Parameters + ---------- + func : function + Input function. + x0 : float + The point at which the nth derivative is found. + dx : float, optional + Spacing. + n : int, optional + Order of the derivative. Default is 1. + args : tuple, optional + Arguments + order : int, optional + Number of points to use, must be odd. + + Notes + ----- + Decreasing the step size too small can result in round-off error. + + Examples + -------- + >>> def f(x): + ... return x**3 + x**2 + >>> float(derivative(f, 1.0, dx=1e-6)) + 4.9999999999217... + + """ + return _derivative(func, x0, dx, n, args, order) + + +def _central_diff_weights(Np, ndiv=1): + """ + Return weights for an Np-point central derivative. + + Assumes equally-spaced function points. + + If weights are in the vector w, then + derivative is w[0] * f(x-ho*dx) + ... + w[-1] * f(x+h0*dx) + + Parameters + ---------- + Np : int + Number of points for the central derivative. + ndiv : int, optional + Number of divisions. Default is 1. + + Returns + ------- + w : ndarray + Weights for an Np-point central derivative. Its size is `Np`. + + Notes + ----- + Can be inaccurate for a large number of points. + + Examples + -------- + We can calculate a derivative value of a function. + + >>> def f(x): + ... return 2 * x**2 + 3 + >>> x = 3.0 # derivative point + >>> h = 0.1 # differential step + >>> Np = 3 # point number for central derivative + >>> weights = _central_diff_weights(Np) # weights for first derivative + >>> vals = [f(x + (i - Np/2) * h) for i in range(Np)] + >>> float(sum(w * v for (w, v) in zip(weights, vals))/h) + 11.79999999999998... + + This value is close to the analytical solution: + f'(x) = 4x, so f'(3) = 12 + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/Finite_difference + + """ + if Np < ndiv + 1: + raise ValueError( + "Number of points must be at least the derivative order + 1." + ) + if Np % 2 == 0: + raise ValueError("The number of points must be odd.") + from scipy import linalg + + ho = Np >> 1 + x = arange(-ho, ho + 1.0) + x = x[:, newaxis] + X = x**0.0 + for k in range(1, Np): + X = hstack([X, x**k]) + w = prod(arange(1, ndiv + 1), axis=0) * linalg.inv(X)[ndiv] + return w + + +def _derivative(func, x0, dx=1.0, n=1, args=(), order=3): + """ + Find the nth derivative of a function at a point. + + Given a function, use a central difference formula with spacing `dx` to + compute the nth derivative at `x0`. + + Parameters + ---------- + func : function + Input function. + x0 : float + The point at which the nth derivative is found. + dx : float, optional + Spacing. + n : int, optional + Order of the derivative. Default is 1. + args : tuple, optional + Arguments + order : int, optional + Number of points to use, must be odd. + + Notes + ----- + Decreasing the step size too small can result in round-off error. + + Examples + -------- + >>> def f(x): + ... return x**3 + x**2 + >>> float(_derivative(f, 1.0, dx=1e-6)) + 4.9999999999217... + + """ + if order < n + 1: + raise ValueError( + "'order' (the number of points used to compute the derivative), " + "must be at least the derivative order 'n' + 1." + ) + if order % 2 == 0: + raise ValueError( + "'order' (the number of points used to compute the derivative) " + "must be odd." + ) + # pre-computed for n=1 and 2 and low-order for speed. + if n == 1: + if order == 3: + weights = array([-1, 0, 1]) / 2.0 + elif order == 5: + weights = array([1, -8, 0, 8, -1]) / 12.0 + elif order == 7: + weights = array([-1, 9, -45, 0, 45, -9, 1]) / 60.0 + elif order == 9: + weights = array([3, -32, 168, -672, 0, 672, -168, 32, -3]) / 840.0 + else: + weights = _central_diff_weights(order, 1) + elif n == 2: + if order == 3: + weights = array([1, -2.0, 1]) + elif order == 5: + weights = array([-1, 16, -30, 16, -1]) / 12.0 + elif order == 7: + weights = array([2, -27, 270, -490, 270, -27, 2]) / 180.0 + elif order == 9: + weights = ( + array([-9, 128, -1008, 8064, -14350, 8064, -1008, 128, -9]) + / 5040.0 + ) + else: + weights = _central_diff_weights(order, 2) + else: + weights = _central_diff_weights(order, n) + val = 0.0 + ho = order >> 1 + for k in range(order): + val += weights[k] * func(x0 + (k - ho) * dx, *args) + return val / prod((dx,) * n, axis=0) diff --git a/phydmslib/simulate.py b/phydmslib/simulate.py index 5dfc51d..f5d534e 100644 --- a/phydmslib/simulate.py +++ b/phydmslib/simulate.py @@ -120,9 +120,7 @@ def simulateAlignment(model, treeFile, alignmentPrefix, randomSeed=False): file with the name having the prefix giving by `alignmentPrefix` and the suffix `'_simulatedalignment.fasta'`. """ - if randomSeed is False: - pass - else: + if randomSeed: random.seed(randomSeed) # Transform the branch lengths by dividing by the model `branchScale` @@ -146,7 +144,10 @@ def simulateAlignment(model, treeFile, alignmentPrefix, randomSeed=False): info = '_temp_{0}info.txt'.format(alignmentPrefix) rates = '_temp_{0}_ratefile.txt'.format(alignmentPrefix) evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolve_tree) - evolver(seqfile=alignment, infofile=info, ratefile=rates) + if randomSeed: + evolver(seqfile=alignment, infofile=info, ratefile=rates, seed=randomSeed) + else: + evolver(seqfile=alignment, infofile=info, ratefile=rates) for f in [rates, info, "custom_matrix_frequencies.txt"]: if os.path.isfile(f): os.remove(f) diff --git a/phydmslib/treelikelihood.py b/phydmslib/treelikelihood.py index 0544226..054de97 100644 --- a/phydmslib/treelikelihood.py +++ b/phydmslib/treelikelihood.py @@ -300,7 +300,7 @@ def __init__(self, tree, alignment, model, underflowfreq=3, self.descendants.append([self.name_to_nodeindex[nx] for nx in node.find_clades() if nx != node]) self.name_to_nodeindex[node] = n - self.gaps = numpy.array(self.gaps) + self.gaps = numpy.array(self.gaps, dtype=object) assert len(self.gaps) == self.ntips # _index_to_param defines internal mapping of diff --git a/phydmslib/weblogo.py b/phydmslib/weblogo.py index 8fbb631..7910e54 100644 --- a/phydmslib/weblogo.py +++ b/phydmslib/weblogo.py @@ -474,7 +474,7 @@ def LogoPlot(sites, datatype, data, plotfile, nperline, # create web logo charstring = ''.join(chars_for_string) - assert len(charstring) == len(chars_for_string),\ + assert len(charstring) == len(chars_for_string), \ ("Length of charstring doesn't match length of " "chars_for_string. Do you have unallowable multi-letter " "characters?\n%s" @@ -741,7 +741,7 @@ def format_color(color): s.append((s_d[aa], aa)) else: s.append((s_d[aa], ' ')) -# s = [(s_d[aa], aa) for aa in ordered_alphabets[seq_index]] + # s = [(s_d[aa], aa) for aa in ordered_alphabets[seq_index]] # Sort by frequency. If equal frequency then reverse alphabetic # (So sort reverse alphabetic first, then frequencty) @@ -889,8 +889,8 @@ def read_transfac(fin, alphabet=None): hcols = len(header) rows = len(items) cols = len(items[0]) - if not(header[0] == 'PO' or header[0] == 'P0' or - hcols == cols-1 or hcols == cols-2): + if not (header[0] == 'PO' or header[0] == 'P0' or + hcols == cols-1 or hcols == cols-2): raise ValueError("Missing header line!") # Do all lines (except the first) contain the same number of items? @@ -1055,8 +1055,8 @@ def LogoOverlay(sites, overlayfile, overlay, nperline, sitewidth, rmargin, for (prop_d, shortname, longname) in overlay: if shortname == longname == 'wildtype': assert all(((isinstance(prop, str) and len(prop) == 1) for - prop in prop_d.values())),\ - 'prop_d does not give letters' + prop in prop_d.values())), \ + 'prop_d does not give letters' proptype = 'wildtype' (vmin, vmax) = (0, 1) # not used, but need to be assigned propcategories = None # not used, but needs to be assigned diff --git a/scripts/phydms b/scripts/phydms index 63ce803..02b8efe 100644 --- a/scripts/phydms +++ b/scripts/phydms @@ -229,7 +229,7 @@ def main(): seqnames = set([head for (head, seq) in alignment]) # process the substitution model - yngkp_match = re.compile('^YNGKP_(?PM\d+)$') + yngkp_match = re.compile(r'^YNGKP_(?PM\d+)$') if (isinstance(args['model'], str) and yngkp_match.search(args['model'])): for argname in ['randprefs', 'avgprefs', diff --git a/scripts/phydms_comprehensive b/scripts/phydms_comprehensive index 78cc1f4..be38fff 100644 --- a/scripts/phydms_comprehensive +++ b/scripts/phydms_comprehensive @@ -155,7 +155,7 @@ def main(): filesuffixlist = (copy.deepcopy(filesuffixlist) + ['_diffprefsbysite.txt']) for prefsfile in args['prefsfiles']: - if re.search('\s', prefsfile): + if re.search(r'\s', prefsfile): raise ValueError("There is a space in the preferences file name:\ {0}".format(prefsfile)) prefsfilebase = os.path.splitext(os.path.basename(prefsfile))[0] diff --git a/scripts/phydms_divpressure b/scripts/phydms_divpressure index 83f64d1..367897a 100644 --- a/scripts/phydms_divpressure +++ b/scripts/phydms_divpressure @@ -239,7 +239,7 @@ def main(): # set up the ExpCM without diversifying pressure expcmadditionalcmds = [] nempiricalExpCM = 3 - if re.search('\s', args['prefsfile']): + if re.search(r'\s', args['prefsfile']): raise ValueError("There is a space in the preferences file name:\ {0}".format(args['prefsfile'])) prefsfilebase = os.path.splitext(os.path.basename(args['prefsfile']))[0] diff --git a/scripts/phydms_logoplot b/scripts/phydms_logoplot index a8e35c6..7366cf6 100644 --- a/scripts/phydms_logoplot +++ b/scripts/phydms_logoplot @@ -75,7 +75,7 @@ def main(): data[r] = {} sitediffprefs = diffprefs[diffprefs['site'].astype(str) == r] for a in aas: - data[r][a] = float(sitediffprefs['dpi_{0}'.format(a)]) + data[r][a] = float(sitediffprefs['dpi_{0}'.format(a)].iloc[0]) sumpos = sum([dpi for dpi in data[r].values() if dpi > 0]) sumneg = sum([-dpi for dpi in data[r].values() if dpi < 0]) if max(sumpos, sumneg) >= ydatamax: @@ -97,9 +97,9 @@ def main(): assert set(sites) == set(map(str, omegas['site'].values)),\ ("sites in {0} don't match those in prefs or diffprefs" .format(args['omegabysite'])) - shortname = '$\omega_r$' - longname = ('{0} $<1 \; \longleftarrow$ $\log_{{10}} P$ ' - '$\longrightarrow \;$ {0} $>1$'.format(shortname)) + shortname = r'$\omega_r$' + longname = (r'{0} $<1 \; \longleftarrow$ $\log_{{10}} P$ ' + r'$\longrightarrow \;$ {0} $>1$'.format(shortname)) prop_d = {} for r in sites: omegar = float(omegas[omegas['site'].astype(str) == r]['omega']) diff --git a/setup.py b/setup.py index e60f026..fde120d 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ def extensions(): Extension('phydmslib.numutils', ['phydmslib/numutils.pyx'], include_dirs=[numpy.get_include()], extra_compile_args=['-Wno-unused-function']), - ] + ] return cythonize(ext) # main setup command @@ -79,21 +79,21 @@ def extensions(): long_description = readme, license = 'GPLv3', setup_requires = [ - 'cython>=0.28', - 'numpy>=1.11', + 'cython>=3.0', + 'numpy>=2.2', ], install_requires = [ - 'biopython>=1.67', - 'cython>=0.28', - 'numpy>=1.16.5', - 'scipy>=0.18', - 'matplotlib>=2.0.2', - 'natsort>=5.0.1', - 'sympy>=1.0', - 'six>=1.10', - 'pandas>=0.20.2', - 'pyvolve>=1.0.3', - 'statsmodels>=0.8', + 'biopython>=1.85', + 'cython>=3.0', + 'numpy>=2.2', + 'scipy>=1.15', + 'matplotlib>=3.10', + 'natsort>=8.4', + 'sympy>=1.13', + 'six>=1.17', + 'pandas>=2.2', + 'pyvolve>=1.1', + 'statsmodels>=0.14', 'weblogo>=3.4, <3.6', 'PyPDF2>=1.26', ], diff --git a/tests/test_omegabysite.py b/tests/test_omegabysite.py index c31105e..7e1cf59 100644 --- a/tests/test_omegabysite.py +++ b/tests/test_omegabysite.py @@ -52,23 +52,31 @@ def initializeModel(self): def test_OnSimulatedData(self): """Test on Simulated Data.""" - random.seed(1) + seed = 1 + random.seed(seed) divpressuresites = random.sample(range(self.nsites), 5) partitions = phydmslib.simulate.pyvolvePartitions( self.model, (200.0, divpressuresites)) + evolver = pyvolve.Evolver( - partitions=partitions, tree=pyvolve.read_tree(file=self.tree)) + partitions=partitions, + tree=pyvolve.read_tree(file=self.tree), + seed=seed, + ) + simulateprefix = os.path.join(self.outdir, self.modelname) simulatedalignment = simulateprefix + "_simulatedalignment.fasta" info = simulateprefix + "_temp_info.txt" rates = simulateprefix + "_temp_ratefile.txt" evolver(seqfile=simulatedalignment, infofile=info, ratefile=rates) + subprocess.check_call(["phydms", simulatedalignment, self.tree, self.modelarg, simulateprefix, "--omegabysite", "--brlen", "scale"]) omegabysitefile = simulateprefix + "_omegabysite.txt" omegas = pandas.read_csv(omegabysitefile, sep="\t", comment="#") divpressureomegas = omegas[omegas["site"].isin(divpressuresites)] + self.assertTrue(len(divpressureomegas) == len(divpressuresites)) self.assertTrue( (divpressureomegas["omega"].values > 2).all(), diff --git a/tests/test_treelikelihood_fitprefs.py b/tests/test_treelikelihood_fitprefs.py index b1fad73..5ee04a5 100644 --- a/tests/test_treelikelihood_fitprefs.py +++ b/tests/test_treelikelihood_fitprefs.py @@ -26,8 +26,9 @@ class test_TreeLikelihood_ExpCM_fitprefs(unittest.TestCase): def setUp(self): """Set up for tests.""" - numpy.random.seed(1) - random.seed(1) + seed = 1 + numpy.random.seed(seed) + random.seed(seed) nsites = 1 minpref = 0.001 @@ -64,7 +65,8 @@ def setUp(self): # simulate alignment using realmodel evolver = pyvolve.Evolver( partitions=phydmslib.simulate.pyvolvePartitions(self.realmodel), - tree=pyvolve.read_tree(file=treefile)) + tree=pyvolve.read_tree(file=treefile), + seed=seed) alignmentfile = "_temp_fitprefs_simulatedalignment.fasta" info = "_temp_info.txt" rates = "_temp_ratefile.txt" diff --git a/tutorial/phydms_tutorial.Rmd b/tutorial/phydms_tutorial.Rmd index 9e77b23..6281739 100644 --- a/tutorial/phydms_tutorial.Rmd +++ b/tutorial/phydms_tutorial.Rmd @@ -10,35 +10,35 @@ output: code_folding: hide --- -In this tutorial you will learn the basics of [`phydms`](http://jbloomlab.github.io/phydms/index.html). We will walk through examples of analyses you may want to run to compare your deep mutational scanning data to natural sequence evolution. For more details on any of the steps, please see the [phydms documentation](http://jbloomlab.github.io/phydms/index.html). +In this tutorial you will learn the basics of [`phydms`](http://jbloomlab.github.io/phydms/index.html). We will walk through examples of analyses you may want to run to compare your deep mutational scanning data to natural sequence evolution. For more details on any of the steps, please see the [phydms documentation](http://jbloomlab.github.io/phydms/index.html). [`phydms`](http://jbloomlab.github.io/phydms/index.html) was developed at the [Bloom Lab](http://research.fhcrc.org/bloom/en.html) ([full list of contributors](https://github.com/jbloomlab/phydms/graphs/contributors)) -#Comparing your `DMS` data to natural sequence evolution -After you perform your deep mutational scanning experiment, you may want to know how well the experimental measurements you made in the lab describe the natural sequence evolution of your protein. Using [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) you can compare the phylogenetic substitution model [*ExpCM*](https://academic.oup.com/mbe/article/31/10/2753/1015257/An-Experimentally-Informed-Evolutionary-Model), which takes into account the site-specific amino-acid preferences from your deep mutational scanning experiment, to traditional, non-site specific models from the [*YNGKP*](http://www.genetics.org/content/155/1/431) family. [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) will run [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) in several different modes to generate results for the appropriate comparisons. +#Comparing your `DMS` data to natural sequence evolution +After you perform your deep mutational scanning experiment, you may want to know how well the experimental measurements you made in the lab describe the natural sequence evolution of your protein. Using [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) you can compare the phylogenetic substitution model [*ExpCM*](https://academic.oup.com/mbe/article/31/10/2753/1015257/An-Experimentally-Informed-Evolutionary-Model), which takes into account the site-specific amino-acid preferences from your deep mutational scanning experiment, to traditional, non-site specific models from the [*YNGKP*](http://www.genetics.org/content/155/1/431) family. [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) will run [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) in several different modes to generate results for the appropriate comparisons. -For the standard [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) analysis, you will need [amino-acid preferences](#prefs) from your deep mutational scanning experiment and a codon-level sequence [alignment](#seqs) of your gene. [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) will then +For the standard [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) analysis, you will need [amino-acid preferences](#prefs) from your deep mutational scanning experiment and a codon-level sequence [alignment](#seqs) of your gene. [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) will then -• infer a tree using [`RAxML`](http://sco.h-its.org/exelixis/software.html) -• run [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) with the *YNGKP_M0*, the *YNKGP_M5*, the *ExpCM*, and a control *ExpCM* run with averaged preferences -• summarize the results +• infer a tree using [`RAxML`](http://sco.h-its.org/exelixis/software.html) +• run [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) with the *YNGKP_M0*, the *YNKGP_M5*, the *ExpCM*, and a control *ExpCM* run with averaged preferences +• summarize the results -See the [full documentation](http://jbloomlab.github.io/phydms/index.html) for specifics on these models. +See the [full documentation](http://jbloomlab.github.io/phydms/index.html) for specifics on these models. -We are going to walk through an example of [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) for the $\beta$-lactamase gene. We will compare an *ExpCM* with deep mutational scanning data from [Stiffler *et al*, 2015](http://www.sciencedirect.com/science/article/pii/S0092867415000781) to the *YNGKP* family of models. +We are going to walk through an example of [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) for the $\beta$-lactamase gene. We will compare an *ExpCM* with deep mutational scanning data from [Stiffler *et al*, 2015](http://www.sciencedirect.com/science/article/pii/S0092867415000781) to the *YNGKP* family of models. ##`phydms_comprehensive` command-line usage -Here is the full list of requirements and options for [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html). Below is a discussion of the [input files](#inputFiles), running the [`phydms_comprehensive` command](#phydms_comprehensiveCommand), and [interpretation of the results](#interpretation). +Here is the full list of requirements and options for [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html). Below is a discussion of the [input files](#inputFiles), running the [`phydms_comprehensive` command](#phydms_comprehensiveCommand), and [interpretation of the results](#interpretation). ```{r phydms_comprehensive help, engine='bash', comment=NA, error=TRUE} phydms_comprehensive -h -``` +``` ##Input Files{#inputFiles} This example uses the following files: [`betaLactamase_enrichmentScores.csv`](example_data/betaLactamase_enrichmentScores.csv), [`betaLactamase_prefs.csv`](example_data/betaLactamase_prefs.csv), and [`betaLactamase_alignment.fasta`](example_data/betaLactamase_alignment.fasta). ###Amino-acid preferences{#prefs} -Often data from deep mutational scanning experiments is reported as the enrichment of a given amino acid compared to the wild-type amino acid. +Often data from deep mutational scanning experiments is reported as the enrichment of a given amino acid compared to the wild-type amino acid. Here is a snippet of the log enrichment scores for $\beta$-lactamase from the file [betaLactamase_enrichmentScores.csv](example_data/betaLactamase_enrichmentScores.csv) ([Stiffler *et al*, 2015 `Supplemental File 1`](http://www.sciencedirect.com/science/article/pii/S0092867415000781)) and the full dataset visualized as a heatmap. ```{r enrichment scores heatmap, comment=NA, warning=FALSE, fig.width = 15, fig.height = 10} @@ -50,12 +50,13 @@ p <- ggplot(df, aes(Site, AminoAcid)) + geom_tile(aes(fill = Trial_1_AmpConc_250 p = p + theme_grey(base_size = base_size) + scale_x_discrete(expand = c(0, 0)) + scale_y_discrete(expand = c(0, 0)) p -``` +``` -[`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) uses *amino-acid preferences* rather than *enrichment scores*. We can transform these log enrichment scores to amino-acid preferences by normalizing the exponentiation of the scores for each site. Below are the first few sites and amino acids of the resulting [preferences](example_data/betaLactamase_prefs.csv). Notice that the numbering has changed from above. In [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) a site is numbered in relation to the first site in the preferences rather than to the start codon. +[`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) uses *amino-acid preferences* rather than *enrichment scores*. We can transform these log enrichment scores to amino-acid preferences by normalizing the exponentiation of the scores for each site. Below are the first few sites and amino acids of the resulting [preferences](example_data/betaLactamase_prefs.csv). Notice that the numbering has changed from above. In [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) a site is numbered in relation to the first site in the preferences rather than to the start codon. ```{r enrichment scores to preferences, engine="python", comment=NA} import pandas as pd + df = pd.read_csv("example_data/betaLactamase_enrichmentScores.csv") minenrichment = 1.0e-4 # minimum allowed enrichment df["preference"] = [max(minenrichment, (10**df["Trial_1_AmpConc_2500"][x] + 10**df["Trial_2_AmpConc_2500.1"][x])/2) for x in range(len(df))] @@ -63,10 +64,10 @@ df = df.pivot(index = "Site", columns = "AminoAcid", values = "preference")2df.f df = df.div(df.sum(axis=1), axis=0) df.insert(0, "site", range(1,len(df)+1)) df.to_csv("example_data/betaLactamase_prefs.csv", index = False) -print df.iloc[:,:9].head(6).to_string(index = False) -``` +print(df.iloc[:,:9].head(6).to_string(index = False)) +``` -We can visualize the preferences as a logoplot using [phydms_logoplot](http://jbloomlab.github.io/phydms/phydms_logoplot.html): +We can visualize the preferences as a logoplot using [phydms_logoplot](http://jbloomlab.github.io/phydms/phydms_logoplot.html): ```{r bacLactamase logoplot, engine="bash", comment=NA, error=TRUE, include=FALSE} phydms_logoplot logoplots/betaLactamase_prefs.pdf --prefs example_data/betaLactamase_prefs.csv --mapmetric functionalgroup --colormap mapmetric ``` @@ -80,25 +81,28 @@ convert -density 134 -trim logoplots/betaLactamase_prefs.pdf +append logoplots/b For more information on the preference file formats, please see the [full `phydms` documentation](http://jbloomlab.github.io/phydms/phydms_prog.html). For more detailed information on transforming enrichment scores into amino-acid preferences, please see [`dms_tools`: Algorithm to infer site-specific preferences](http://jbloomlab.github.io/dms_tools/inferprefs_algorithm.html). ###Sequences{#seqs} -We will use an [alignment](example_data/betaLactamase_alignment.fasta) of $\beta$-lactamase sequences. It is important to note there is exactly the same number of preferences as sites in the alignment. -```{r alignment, engine="python", comment=NA, echo = FALSE} +We will use an [alignment](example_data/betaLactamase_alignment.fasta) of $\beta$-lactamase sequences. It is important to note there is exactly the same number of preferences as sites in the alignment. +```{r alignment, engine="python", comment=NA, echo = FALSE} import pandas as pd from Bio import SeqIO + seqFileName = "example_data/betaLactamase_alignment.fasta" prefFileName = "example_data/betaLactamase_prefs.csv" -sequences = list(SeqIO.parse(open(seqFileName),'fasta')) -assert len(set([len(x.seq) for x in sequences ])) == 1 -print "Sequences read from: %s"%seqFileName -print "There are %s sequences"%len(sequences) -print "Each sequence is %s amino acids long"%str(len(sequences[0].seq)/3) -print "Preferences read from: %s"%prefFileName -print "There are preferences measured for %s sites"%len(pd.read_csv(prefFileName)) -``` +with open(seqFileName) as file: + sequences = list(SeqIO.parse(file, 'fasta')) +assert len(set(len(x.seq) for x in sequences)) == 1 +prefs_df = pd.read_csv(prefFileName) +print(f"Sequences read from: {seqFileName}") +print(f"There are {len(sequences)} sequences") +print(f"Each sequence is {len(sequences[0].seq) // 3} amino acids long") # Integer division +print(f"Preferences read from: {prefFileName}") +print(f"There are preferences measured for {prefs_df.shape[0]} sites") +``` -You can use the [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) auxiliary program [`phydms_prepalignment`](http://jbloomlab.github.io/phydms/phydms_prepalignment.html) to filter your sequences and prepare an alignment for [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html). +You can use the [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) auxiliary program [`phydms_prepalignment`](http://jbloomlab.github.io/phydms/phydms_prepalignment.html) to filter your sequences and prepare an alignment for [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html). ###`phydms_comprehensive`{#phydms_comprehensiveCommand} -We can now run [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) by specifying our output prefix (in this case a directory called `betaLactamase`), our [preferences](#prefs), and our [alignment](#seqs). The output below is the [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) run log. +We can now run [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) by specifying our output prefix (in this case a directory called `betaLactamase`), our [preferences](#prefs), and our [alignment](#seqs). The output below is the [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) run log. ```{r phydms_comprehensive run, engine='bash', comment=NA, error=TRUE} @@ -107,15 +111,15 @@ phydms_comprehensive betaLactamase/ example_data/betaLactamase_alignment.fasta e ##Output files{#outputFiles} -[`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) produces both the standard [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) output files for each model and a summary file. Please see the [full documentation](http://jbloomlab.github.io/phydms/phydms.html) for more information on the standard output files. +[`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) produces both the standard [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) output files for each model and a summary file. Please see the [full documentation](http://jbloomlab.github.io/phydms/phydms.html) for more information on the standard output files. ```{r phydms_comprehensive output files, engine='bash', comment=NA, error=TRUE} ls betaLactamase/* -``` +``` ##Interpretation of `phydms_comprehensive` results{#interpretation} -To compare the *ExpCM* with $\beta$-lactamase deep mutational scanning data to the *YNGKP* family we can look at the summary file [`modelcomparison.md`](betaLactamase/modelcomparison.md). +To compare the *ExpCM* with $\beta$-lactamase deep mutational scanning data to the *YNGKP* family we can look at the summary file [`modelcomparison.md`](betaLactamase/modelcomparison.md). | Model | deltaAIC | LogLikelihood | nParams | ParamValues | @@ -139,28 +143,28 @@ phydms_logoplot logoplots/betaLactamase_prefs_beta3.pdf --prefs logoplots/exampl convert -density 134 -trim logoplots/betaLactamase_prefs_beta1.pdf +append logoplots/betaLactamase_prefs_beta1.png convert -density 134 -trim logoplots/betaLactamase_prefs_beta0.pdf +append logoplots/betaLactamase_prefs_beta0.png convert -density 134 -trim logoplots/betaLactamase_prefs_beta3.pdf +append logoplots/betaLactamase_prefs_beta3.png -``` -We can also evaluate the amino-acid preferences by the value of the *ExpCM* stringency parameter, $\beta$. $\beta$ is a way to gauge how well the selection in the lab compares to selection in nature. When $\beta$ is fit to be greater than $1$ it means the selection in nature prefers the same amino acids but with a greater stringency. The converse is true when $\beta$ is less than $1$. When our preferences are re-scaled with a $\beta>1$, we see the strongly preferred amino acids "grow" while the weakly preferred amino acids "shrink". +``` +We can also evaluate the amino-acid preferences by the value of the *ExpCM* stringency parameter, $\beta$. $\beta$ is a way to gauge how well the selection in the lab compares to selection in nature. When $\beta$ is fit to be greater than $1$ it means the selection in nature prefers the same amino acids but with a greater stringency. The converse is true when $\beta$ is less than $1$. When our preferences are re-scaled with a $\beta>1$, we see the strongly preferred amino acids "grow" while the weakly preferred amino acids "shrink". -![Above is a snippet of the $\beta$-lactamase preferences without scaling by $\beta$](logoplots/betaLactamase_prefs_beta1.png) +![Above is a snippet of the $\beta$-lactamase preferences without scaling by $\beta$](logoplots/betaLactamase_prefs_beta1.png) -![Above is a snippet of the $\beta$-lactamase preferences scaled by $\beta = 3$](logoplots/betaLactamase_prefs_beta3.png) +![Above is a snippet of the $\beta$-lactamase preferences scaled by $\beta = 3$](logoplots/betaLactamase_prefs_beta3.png) -When $\beta=0$, the heights become uniform and the *ExpCM* loses all of its site-specific information. -![Above is a snippet of the $\beta$-lactamase preferences scaled by $\beta = 0$](logoplots/betaLactamase_prefs_beta0.png) +When $\beta=0$, the heights become uniform and the *ExpCM* loses all of its site-specific information. +![Above is a snippet of the $\beta$-lactamase preferences scaled by $\beta = 0$](logoplots/betaLactamase_prefs_beta0.png) In the $\beta$-lactamase example, $\beta$ is fit to be $1.36$. **Since this number is close to $1$, we can conclude that not only is the *ExpCM* with amino-acid preferences a better description of natural sequence evolution than non-site specific models but that natural evolution prefers the same amino acids that are preferred in the experiment, but with slightly greater stringency.** For more information on scaling amino-acid preferences by $\beta$, please see [Bloom, 2014](http://research.fhcrc.org/content/dam/stripe/bloom/labfiles/publications/Bloom2014b.pdf). -Please see the [full documentation](http://jbloomlab.github.io/phydms/index.html) if you would like to learn more about the [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) program and its other options. - +Please see the [full documentation](http://jbloomlab.github.io/phydms/index.html) if you would like to learn more about the [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) program and its other options. + #Comparing two `DMS` datasets for the same protein If you perform a deep mutational scanning experiment multiple times under slightly different experimental conditions, you may want to compare how well each dataset explains natural sequence variation. These experimental differences could be in how the variant libraries were generated, how the selection pressure was exerted, etc. We can use [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) to compare *ExpCM* models with two or more different sets of preferences to both the *YNGKP* family of models and to each other. For this [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) analysis, you will need multiple sets of [amino-acid preferences](#prefsHA) for the same protein from your deep mutational scanning experiments and a codon-level sequence [alignment](#seqsHA) of your gene. [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) will then -• infer a tree using [`RAxML`](http://sco.h-its.org/exelixis/software.html) -• run [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) with the *YNGKP_M0*, the *YNKGP_M5* and the *ExpCM* (with and without averaged preferences) for each set of preferences -• summarize the results +• infer a tree using [`RAxML`](http://sco.h-its.org/exelixis/software.html) +• run [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) with the *YNGKP_M0*, the *YNKGP_M5* and the *ExpCM* (with and without averaged preferences) for each set of preferences +• summarize the results See the [full documentation](http://jbloomlab.github.io/phydms/index.html) for specifics on these models or the [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) program. @@ -212,13 +216,13 @@ import glob seqFileName = "example_data/HA_alignment.fasta" sequences = list(SeqIO.parse(open(seqFileName),'fasta')) assert len(set([len(x.seq) for x in sequences ])) == 1 -print "Sequences read from: %s"%seqFileName -print "There are %s sequences"%len(sequences) -print "Each sequence is %s amino acids long"%str(len(sequences[0].seq)/3) -print +print(f"Sequences read from: {seqFileName}") +print(f"There are {len(sequences)} sequences") +print(f"Each sequence is {str(len(sequences[0].seq)/3)} amino acids long") +print() for prefFileName in glob.glob("example_data/HA_prefs_*"): - print "Preferences read from: %s"%prefFileName - print "There are preferences measured for %s sites"%len(pd.read_csv(prefFileName)) + print(f"Preferences read from: {prefFileName}") + print(f"There are preferences measured for {len(pd.read_csv(prefFileName)} sites") ``` You can use the [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) auxiliary program [`phydms_prepalignment`](http://jbloomlab.github.io/phydms/phydms_prepalignment.html) to filter your sequences and prepare an alignment for [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html). @@ -284,17 +288,17 @@ for protein in proteins.keys(): prefFileName = proteins[protein][1] sequences = list(SeqIO.parse(open(seqFileName),'fasta')) assert len(set([len(x.seq) for x in sequences ])) == 1 - print protein - print "Sequences read from: %s"%seqFileName - print "There are %s sequences"%len(sequences) - print "Each sequence is %s amino acids long"%str(len(sequences[0].seq)/3) - print "Preferences read from: %s"%prefFileName - print "There are preferences measured for %s sites"%len(pd.read_csv(prefFileName)) - print + print(protein) + print(f"Sequences read from: {seqFileName}") + print(f"There are {len(sequences)} sequences") + print(f"Each sequence is {len(sequences[0].seq)/3} amino acids long") + print(f"Preferences read from: {prefFileName}") + print(f"There are preferences measured for {len(pd.read_csv(prefFileName))} sites") + print() ``` ###`phydms_comprehensive`{#phydms_comprehensiveCommandAll} -We can now run [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) by specifying our output prefix (in this case a directory called `HA_omegabysite` or `betaLactmase_omegabysite`), our [preferences](#inputFilesAll), our [alignment](#inputFilesAll) and the flag `--omegabysite`. Each alignment requires its own [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) run. +We can now run [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) by specifying our output prefix (in this case a directory called `HA_omegabysite` or `betaLactmase_omegabysite`), our [preferences](#inputFilesAll), our [alignment](#inputFilesAll) and the flag `--omegabysite`. Each alignment requires its own [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) run. ```{r diversifying selection HA, engine='bash', comment=NA, error=TRUE, results="hide"} @@ -304,7 +308,7 @@ phydms_comprehensive betaLactamase_omegabysite/ example_data/betaLactamase_align ``` ##Output files -[`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) produces both the standard [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) output files for each model and a summary file. See the [Comparing your `DMS` data to natural sequence evolution](#outputFiles) section of the tutorial for an example and the [full `phydms` documentation](http://jbloomlab.github.io/phydms/phydms.html) for more information on the standard output files. +[`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) produces both the standard [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) output files for each model and a summary file. See the [Comparing your `DMS` data to natural sequence evolution](#outputFiles) section of the tutorial for an example and the [full `phydms` documentation](http://jbloomlab.github.io/phydms/phydms.html) for more information on the standard output files. ##Interpretation of `phydms_comprehensive` results{#interpretationAll} @@ -327,7 +331,7 @@ Here are the sites in $\beta$-lactamase with the strongest evidence for deviatio head -n 20 betaLactamase_omegabysite/ExpCM_betaLactamase_prefs_omegabysite.txt ``` -We can see in both examples there is a small subset of sites which deviate display a faster ($\omega_r>1$) or slower ($\omega_r <1$) than expected rate of amino-acid substitutions. +We can see in both examples there is a small subset of sites which deviate display a faster ($\omega_r>1$) or slower ($\omega_r <1$) than expected rate of amino-acid substitutions. #Detecting Differential Selection{#differentialSelection} @@ -358,17 +362,17 @@ for protein in proteins.keys(): prefFileName = proteins[protein][1] sequences = list(SeqIO.parse(open(seqFileName),'fasta')) assert len(set([len(x.seq) for x in sequences ])) == 1 - print protein - print "Sequences read from: %s"%seqFileName - print "There are %s sequences"%len(sequences) - print "Each sequence is %s amino acids long"%str(len(sequences[0].seq)/3) - print "Preferences read from: %s"%prefFileName - print "There are preferences measured for %s sites"%len(pd.read_csv(prefFileName)) - print + print(protein) + print(f"Sequences read from: {seqFileName}") + print(f"There are {len(sequences)} sequences") + print(f"Each sequence is {len(sequences[0].seq)/3} amino acids long") + print(f"Preferences read from: {prefFileName}") + print(f"There are preferences measured for {len(pd.read_csv(prefFileName))} sites") + print() ``` ###`phydms_comprehensive`{#phydms_comprehensiveCommanddiff} -We can now run [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) by specifying our output prefix (in this case a directory called `HA_diffprefs` or `betaLactmase_diffprefs`), our [preferences](#inputFilesdiff), our [alignment](#inputFilesdiff) and the flag `--diffprefsbysite`. Each alignment requires its own [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) run. +We can now run [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) by specifying our output prefix (in this case a directory called `HA_diffprefs` or `betaLactmase_diffprefs`), our [preferences](#inputFilesdiff), our [alignment](#inputFilesdiff) and the flag `--diffprefsbysite`. Each alignment requires its own [`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) run. ```{r diff selection HA, engine='bash', comment=NA, error=TRUE, results="hide"} @@ -378,7 +382,7 @@ phydms_comprehensive betaLactamase_diffprefs/ example_data/betaLactamase_alignme ``` ##Output files -[`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) produces both the standard [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) output files for each model and a summary file. See the [Comparing your `DMS` data to natural sequence evolution](#outputFiles) section of the tutorial for an example and the [full `phydms` documentation](http://jbloomlab.github.io/phydms/phydms.html) for more information on the standard output files. +[`phydms_comprehensive`](http://jbloomlab.github.io/phydms/phydms_comprehensive_prog.html) produces both the standard [`phydms`](http://jbloomlab.github.io/phydms/phydms_prog.html) output files for each model and a summary file. See the [Comparing your `DMS` data to natural sequence evolution](#outputFiles) section of the tutorial for an example and the [full `phydms` documentation](http://jbloomlab.github.io/phydms/phydms.html) for more information on the standard output files. ##Interpretation of `phydms_comprehensive` results{#interpretationdiff} To detect sites under differential selection, we can look at the summary files with the suffix `diffprefsbysite.txt`. First, we will look at the results from the HA analysis. @@ -390,24 +394,24 @@ Here are the sites (and a subset of the differential preferences) in HA with the ```{r phydms_comprehensive HA diff head, engine='python', comment=NA, error=TRUE} import pandas as pd df = pd.read_csv("HA_diffprefs/ExpCM_HA_prefs_Doud_diffprefsbysite.txt", sep='\t', skiprows=(0,1,2)) -print df.iloc[:,range(9) + [-1]].head(6).to_string(index = False) +print(df.iloc[:,range(9) + [-1]].head(6).to_string(index = False)) ``` and sites with the weakest evidence ```{r phydms_comprehensive HA diff tail, engine='python', comment=NA, error=TRUE} import pandas as pd df = pd.read_csv("HA_diffprefs/ExpCM_HA_prefs_Doud_diffprefsbysite.txt", sep='\t', skiprows=(0,1,2)) -print df.iloc[:,range(9) + [-1]].tail(6).to_string(index = False) +print(df.iloc[:,range(9) + [-1]].tail(6).to_string(index = False)) ``` Here are the sites in $\beta$-lactamase with the strongest evidence for differential selection ```{r phydms_comprehensive beta diff head, engine='python', comment=NA, error=TRUE} import pandas as pd df = pd.read_csv("betaLactamase_diffprefs/ExpCM_betaLactamase_prefs_diffprefsbysite.txt", sep='\t', skiprows=(0,1,2)) -print df.iloc[:,range(9) + [-1]].head(6).to_string(index = False) +print(df.iloc[:,range(9) + [-1]].head(6).to_string(index = False)) ``` -We visualize the $\beta$-lactamase differential preferences for each site using [`phydms_logoplot`](http://jbloomlab.github.io/phydms/phydms_logoplot.html) +We visualize the $\beta$-lactamase differential preferences for each site using [`phydms_logoplot`](http://jbloomlab.github.io/phydms/phydms_logoplot.html) ```{r diff example logoplot, engine="bash", comment=NA, error=TRUE, results="hide"} phydms_logoplot logoplots/betaLactamase_prefs_diff.pdf --mapmetric functionalgroup --colormap mapmetric --diffprefs betaLactamase_diffprefs/ExpCM_betaLactamase_prefs_diffprefsbysite.txt @@ -415,8 +419,8 @@ phydms_logoplot logoplots/betaLactamase_prefs_diff.pdf --mapmetric functionalgro ```{r diff stringency example logoplot convert, engine="bash", comment=NA, error=TRUE, echo=FALSE, include=FALSE} convert -density 134 -trim logoplots/betaLactamase_prefs_diff.pdf +append logoplots/betaLactamase_prefs_diff.png -``` +``` -![](logoplots/betaLactamase_prefs_diff.png) +![](logoplots/betaLactamase_prefs_diff.png) We can see in both the HA and the $\beta$-lactamase example, there are a small subset of sites which appear to be under differential selection.