Skip to content

Tree & Likelihood Computation

Laurent Guéguen edited this page Nov 10, 2021 · 1 revision

Problem

Given an alignment compute a distance tree under a model, and then compute maximum likelihood (including branch lengths) on this tree and alignment.

Solution

#include <Bpp/Seq/Alphabet/DNA.h>
#include <Bpp/Seq/Container/VectorSiteContainer.h>
#include <Bpp/Seq/Io/Fasta.h>
#include <Bpp/Phyl/Model/Nucleotide/T92.h>
#include <Bpp/Phyl/Model/FrequencySet/NucleotideFrequencySet.h>
#include <Bpp/Phyl/Model/RateDistribution/ConstantRateDistribution.h>
#include <Bpp/Phyl/Distance/DistanceEstimation.h>
#include <Bpp/Phyl/Distance/BioNJ.h>
#include <Bpp/Phyl/OptimizationTools.h>
#include <Bpp/Phyl/Tree/PhyloTreeTools.h>

#include <Bpp/Phyl/Likelihood/ParametrizablePhyloTree.h>
#include <Bpp/Phyl/Likelihood/NonHomogeneousSubstitutionProcess.h>
#include <Bpp/Phyl/Likelihood/DataFlow/LikelihoodCalculationSingleProcess.h>

#include <iostream>

using namespace std;
using namespace bpp;

int main (int argc, char* argv[]){
	
string nameSeq = "lysozymeLarge.fasta";
Fasta Fst;
 
auto alpha = std::make_shared<DNA>();

auto sites = std::shared_ptr<AlignedSequenceContainer>(Fst.readAlignment(nameSeq , alpha.get()));

auto model = std::make_shared<T92>(alpha.get(), 3., .1);
auto rdist = std::make_shared<ConstantRateDistribution>();

auto rootFreqs = std::make_shared<GCFrequencySet>(alpha.get());

// Compute ML distance between sequences
DistanceEstimation distEstimation(model, rdist, sites.get(), 1, false);

// Method of clustering
auto distMethod = std::make_shared<BioNJ>();
distMethod->outputPositiveLengths(true);

// Build tree
ParameterList parametersToIgnore;
TreeTemplate<Node>* tree = OptimizationTools::buildDistanceTree(distEstimation, *distMethod, parametersToIgnore);

// Optimize likelihood on this tree 

/* Convert Tree in PhyloTree */
auto phyloT = PhyloTreeTools::buildFromTreeTemplate(*tree);
ParametrizablePhyloTree parTree(*phyloT);

/* Create Substitution Process */
auto subProc= NonHomogeneousSubstitutionProcess::createHomogeneousSubstitutionProcess(model, rdist, &parTree, rootFreqs);

/* Put objects in DataFlow */
Context context;
auto lik = std::make_shared<LikelihoodCalculationSingleProcess>(context, *sites, *subProc);

SingleProcessPhyloLikelihood ntl(context, lik);

cout << endl << endl;
cout << "Initial value " << ntl.getValue() << endl;
OutputStream* profiler  = new StlOutputStream(new ofstream("profile.txt", ios::out));
OutputStream* messenger = new StlOutputStream(new ofstream("messages.txt", ios::out));


OptimizationTools::optimizeNumericalParameters2( ntl, ntl.getParameters(), 0,
                                                 0.0001, 10000, messenger, profiler, false, false, 1,
                                                 OptimizationTools::OPTIMIZATION_NEWTON);

cout << "Final value " << ntl.getValue() << endl;

ntl.getParameters().printParameters(cout);

/* Update Substitution Process parameters */

subProc->matchParametersValues(ntl.getParameters());

subProc->getParameters().printParameters(cout);

}

Clone this wiki locally