diff --git a/Align2/APPS/subali.cc b/Align2/APPS/subali.cc index 8157fc4..0a2f971 100755 --- a/Align2/APPS/subali.cc +++ b/Align2/APPS/subali.cc @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -101,6 +102,7 @@ sShowHelp() { << "\n \t --sf=8: AtchleyCorrelation." << "\n" << "\n [--global] \t Needleman-Wunsch global alignment (default)" + << "\n [--gotoh] \t Needleman-Wunsch-Gotoh global alignment" << "\n [--local] \t Smith-Waterman local alignment" << "\n [--freeshift] \t Free-shift alignment" << "\n [-n ] \t Number of suboptimal alignments (default = 1)" @@ -156,7 +158,7 @@ main(int argc, char **argv) { double weightHelix, weightStrand, weightBuried, weightStraight, weightSpace; double cSeq, cStr; unsigned int weightingScheme, scoringFunction, suboptNum, gapFunction, extensionType, structure; - bool fasta, global, local, freeshift, verbose; + bool fasta, global, gotoh, local, freeshift, verbose; struct tm* newtime; time_t t; @@ -182,9 +184,10 @@ main(int argc, char **argv) { getArg("-sf", scoringFunction, argc, argv, 1); global = getArg("-global", argc, argv); + gotoh = getArg("-gotoh", argc, argv); local = getArg("-local", argc, argv); freeshift = getArg("-freeshift", argc, argv); - if (!local && !freeshift) + if (!local && !freeshift && !gotoh) global = true; getArg("n", suboptNum, argc, argv, 1); getArg("p", suboptPenaltyMul, argc, argv, 1.00); @@ -539,20 +542,26 @@ main(int argc, char **argv) { Align *a; if (global) { - cout << "\nSuboptimal Needleman-Wunsch alignments:\n" << endl; - a = new NWAlign(ad, gf, ss); + cout << "\nSuboptimal Needleman-Wunsch alignments:\n" << endl; + a = new NWAlign(ad, gf, ss); } else - if (local) { - cout << "\nSuboptimal Smith-Waterman alignments:\n" << endl; - a = new SWAlign(ad, gf, ss); - } else { - cout << "\nSuboptimal free-shift alignments:\n" << endl; - try { - a = new FSAlign(ad, gf, ss); - } catch (const char* a) { - cout << "FSAlign error!\n"; - } - } + if (local) { + cout << "\nSuboptimal Smith-Waterman alignments:\n" << endl; + a = new SWAlign(ad, gf, ss); + } + else + if (gotoh) { + cout << "\nSuboptimal Needleman-Wunsch-Gotoh alignments:\n" << endl; + a = new NWGAlign(ad, gf, ss); + } + else { + cout << "\nSuboptimal free-shift alignments:\n" << endl; + try { + a = new FSAlign(ad, gf, ss); + } catch (const char* a) { + cout << "FSAlign error!\n"; + } + } time(&t); newtime = localtime(&t); cout << "object FSAlign created " << newtime->tm_hour << "/" << newtime->tm_min << endl; diff --git a/Align2/Sources/Makefile b/Align2/Sources/Makefile index 196b09a..7c1f6d3 100755 --- a/Align2/Sources/Makefile +++ b/Align2/Sources/Makefile @@ -35,7 +35,7 @@ SOURCES = Alignment.cc AlignmentBase.cc \ PssmInput.cc Profile.cc HenikoffProfile.cc PSICProfile.cc SeqDivergenceProfile.cc \ LogAverage.cc CrossProduct.cc DotPFreq.cc DotPOdds.cc Pearson.cc JensenShannon.cc EDistance.cc AtchleyDistance.cc AtchleyCorrelation.cc Panchenko.cc Zhou.cc \ ThreadingInput.cc Ss2Input.cc ProfInput.cc Sec.cc Threading.cc Ss2.cc Prof.cc ThreadingSs2.cc ThreadingProf.cc \ - ReverseScore.cc stringtools.cc + ReverseScore.cc stringtools.cc NWGAlign.cc OBJECTS = Alignment.o AlignmentBase.o \ Align.o NWAlign.o SWAlign.o FSAlign.o NWAlignNoTermGaps.o \ @@ -46,7 +46,7 @@ OBJECTS = Alignment.o AlignmentBase.o \ PssmInput.o Profile.o HenikoffProfile.o PSICProfile.o SeqDivergenceProfile.o \ LogAverage.o CrossProduct.o DotPFreq.o DotPOdds.o Pearson.o JensenShannon.o EDistance.o AtchleyDistance.o AtchleyCorrelation.o Panchenko.o Zhou.o \ ThreadingInput.o Ss2Input.o ProfInput.o Sec.o Threading.o Ss2.o Prof.o ThreadingSs2.o ThreadingProf.o \ - ReverseScore.o stringtools.o + ReverseScore.o stringtools.o NWGAlign.o TARGETS = diff --git a/Align2/Sources/NWAlign.cc b/Align2/Sources/NWAlign.cc index 9636cbc..8deeff3 100755 --- a/Align2/Sources/NWAlign.cc +++ b/Align2/Sources/NWAlign.cc @@ -125,14 +125,14 @@ namespace Victor { namespace Align2{ for (int i = 1; i <= static_cast (n); i++) { if (update) - F[i][0] = -gf->getOpenPenalty(0) - - gf->getExtensionPenalty(0) * (i - 1); + F[i][0] = -gf->getOpenPenalty(1) - + gf->getExtensionPenalty(i) * (i - 1); B[i][0] = Traceback(i - 1, 0); } for (int j = 1; j <= static_cast (m); j++) { if (update) - F[0][j] = -gf->getOpenPenalty(j) - + F[0][j] = -gf->getOpenPenalty(1) - gf->getExtensionPenalty(j) * (j - 1); B[0][j] = Traceback(0, j - 1); } @@ -144,12 +144,12 @@ namespace Victor { namespace Align2{ if ((i != 1) && (j != 1)) { if (B[i - 1][j].j == j) - extI = F[i - 1][j] - gf->getExtensionPenalty(j); + extI = F[i - 1][j] - gf->getExtensionPenalty(i); else if (B[i - 1][j].j == (j - 1)) - extI = F[i - 1][j] - gf->getOpenPenalty(j); + extI = F[i - 1][j] - gf->getOpenPenalty(i); } else - extI = F[i - 1][j] - gf->getOpenPenalty(j); + extI = F[i - 1][j] - gf->getOpenPenalty(i); if ((i != 1) && (j != 1)) { if (B[i][j - 1].i == i) @@ -202,14 +202,14 @@ namespace Victor { namespace Align2{ for (int i = 1; i <= static_cast (n); i++) { if (update) - F[i][0] = -gf->getOpenPenalty(0) - - gf->getExtensionPenalty(0) * (i - 1); + F[i][0] = -gf->getOpenPenalty(1) - + gf->getExtensionPenalty(i) * (i - 1); B[i][0] = Traceback(i - 1, 0); } for (int j = 1; j <= static_cast (m); j++) { if (update) - F[0][j] = -gf->getOpenPenalty(j) - + F[0][j] = -gf->getOpenPenalty(1) - gf->getExtensionPenalty(j) * (j - 1); B[0][j] = Traceback(0, j - 1); } @@ -229,12 +229,12 @@ namespace Victor { namespace Align2{ if ((i != 1) && (j != 1)) { if (B[i - 1][j].j == j) - extI = F[i - 1][j] - gf->getExtensionPenalty(j); + extI = F[i - 1][j] - gf->getExtensionPenalty(i); else if (B[i - 1][j].j == (j - 1)) - extI = F[i - 1][j] - gf->getOpenPenalty(j); + extI = F[i - 1][j] - gf->getOpenPenalty(i); } else - extI = F[i - 1][j] - gf->getOpenPenalty(j); + extI = F[i - 1][j] - gf->getOpenPenalty(i); if ((i != 1) && (j != 1)) { if (B[i][j - 1].i == i) diff --git a/Align2/Sources/NWGAlign.cc b/Align2/Sources/NWGAlign.cc new file mode 100755 index 0000000..ab2c782 --- /dev/null +++ b/Align2/Sources/NWGAlign.cc @@ -0,0 +1,330 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . + */ +// --*- C++ -*------x----------------------------------------------------------- +// +// +// Description: Implement Needleman-Wunsch-Gotoh global alignment. +// +// -----------------x----------------------------------------------------------- + +#include + +namespace Victor { namespace Align2{ + + static int DIAG = 0; + static int LEFT = 1; + static int UP = 2; + + // CONSTRUCTORS: + /** + * + * @param ad + * @param gf + * @param ss + */ + NWGAlign::NWGAlign(AlignmentData *ad, GapFunction *gf, ScoringScheme *ss) + : Align(ad, gf, ss), F1((ad->getSequence(1)).size() + 1), + F2((ad->getSequence(1)).size() + 1), + D((ad->getSequence(1)).size() + 1), + D1((ad->getSequence(1)).size() + 1), + D2((ad->getSequence(1)).size() + 1){ + + vector frow(m + 1, -std::numeric_limits::max()); + vector drow(m + 1, -1); + + for (unsigned int i = 0; i < F.size(); ++i) { + F[i] = frow; + F1[i] = frow; + F2[i] = frow; + D[i] = drow; + D1[i] = drow; + D2[i] = drow; + } + + pCalculateMatrix(true); + } + /** + * + * @param ad + * @param gf + * @param ss + * @param v1 + * @param v2 + */ + NWGAlign::NWGAlign(AlignmentData *ad, GapFunction *gf, ScoringScheme *ss, + const vector &v1, const vector &v2) + : Align(ad, gf, ss) { + pCalculateMatrix(v1, v2, true); + } + + NWGAlign::NWGAlign(const NWGAlign &orig) : Align(orig) { + } + + NWGAlign::~NWGAlign() { + } + + + // OPERATORS: + /** + * + * @param orig + * @return + */ + NWGAlign& + NWGAlign::operator =(const NWGAlign &orig) { + if (&orig != this) + copy(orig); + POSTCOND((orig == *this), exception); + return *this; + } + + + // PREDICATES: + /** + * + */ + void + NWGAlign::getMultiMatch() { + //TODO + } + + vector + NWGAlign::getMatch() const { + + string res1, res2; + res1Pos.clear(); + res2Pos.clear(); + + int i = n; + int j = m; + int nextI = -1; + int nextJ = -1; + + + double val = max(max(F[n][m], F1[n][m]), F2[n][m]); + + int K = -1; + + if(val == F[n][m]){ + K = DIAG; + } + else if(val == F2[n][m]){ + K = LEFT; + } + else if(val == F1[n][m]){ + K = UP; + } + else{ + ERROR("Error in NWGAlign", exception); + } + + while (i > 0 || j > 0) { + + if(K == DIAG){ + nextI = i-1; + nextJ = j-1; + K = D[i][j]; + } + else if(K == UP){ + nextI = i-1; + nextJ = j; + K = D1[i][j]; + } + else if(K == LEFT){ + nextI = i; + nextJ = j-1; + K = D2[i][j]; + } + else{ + ERROR("Error in NWGAlign", exception); + } + + if (i == nextI) { + res1 = res1 + "-"; + res1Pos.push_back(INVALID_POS); + } else { + res1 = res1 + (ad->getSequence(1))[i - 1]; + res1Pos.push_back(i - 1); + } + if (j == nextJ) { + res2 = res2 + "-"; + res2Pos.push_back(INVALID_POS); + } else { + res2 = res2 + (ad->getSequence(2))[j - 1]; + res2Pos.push_back(j - 1); + } + ad->calculateMatch(i, nextI, j, nextJ); + + i = nextI; + j = nextJ; + } + + reverse(res1.begin(), res1.end()); + reverse(res2.begin(), res2.end()); + reverse(res1Pos.begin(), res1Pos.end()); + reverse(res2Pos.begin(), res2Pos.end()); + vector res(2); + res[0] = res1; + res[1] = res2; + + return res; + } + + double NWGAlign::getScore() const { + return max(max(F[n][m], F1[n][m]), F2[n][m]); + } + + /** + * + * @param num + * @return + */ + vector + NWGAlign::generateMultiMatch(unsigned int num) { + + //TODO draft version for one optimal alignment + + vector va; + this->getMatch(); + double tmpScore = this->getScore(); + ad->getMatch(); + Alignment *tmp = new Alignment; + *tmp = ad->generateMatch(tmpScore); + va.push_back(*tmp); + + return va; + } + + // MODIFIERS: + + void + NWGAlign::copy(const NWGAlign &orig) { + Align::copy(orig); + } + /** + * + * @return + */ + NWGAlign* + NWGAlign::newCopy() { + NWGAlign *tmp = new NWGAlign(*this); + return tmp; + } + + // HELPERS: + /** + * + * @param update + */ + void + NWGAlign::pCalculateMatrix(bool update) { + + if (update){ + F[0][0] = F1[0][0] = F2[0][0] = 0; + + //Init first column + for (int i = 1; i <= static_cast (n); i++) { + F1[i][0] = -gf->getOpenPenalty(1) - gf->getExtensionPenalty(i) * (i - 1); + D1[i][0] = UP; + F[i][0] = F2[i][0] = -std::numeric_limits::max(); + } + + //Init first row + for (int j = 1; j <= static_cast (m); j++) { + F2[0][j] = -gf->getOpenPenalty(1) - gf->getExtensionPenalty(j) * (j - 1); + D2[0][j] = LEFT; + F[0][j] = F1[0][j] = -std::numeric_limits::max(); + } + } + + for (int i = 1; i <= static_cast (n); i++){ + for (int j = 1; j <= static_cast (m); j++) { + + //F - diag (i-1,j-1) + double s = ss->scoring(i, j); + double m = F[i-1][j-1] + s; + double m1 = F1[i-1][j-1] + s; + double m2 = F2[i-1][j-1] + s; + double mv = max(max(m, m1), m2); + + if(mv == m){ + F[i][j] = mv; + D[i][j] = DIAG; + } + else if(mv == m2){ + F[i][j] = m2; + D[i][j] = LEFT; + } + else if(mv == m1){ + F[i][j] = m1; + D[i][j] = UP; + } + else{ + ERROR("Error in NWGAlign", exception); + } + + //F1 - up (i-1,j) + m = F[i-1][j] - gf->getOpenPenalty(i); + m1 = F1[i-1][j] - gf->getExtensionPenalty(i); + mv = max(m, m1); + + if(mv == m){ + F1[i][j] = mv; + D1[i][j] = DIAG; + } + else if(mv == m1){ + F1[i][j] = m1; + D1[i][j] = UP; + } + else{ + ERROR("Error in NWGAlign", exception); + } + + //F2 - left (i,j-1) + m = F[i][j-1] - gf->getOpenPenalty(j); + m2 = F2[i][j-1] - gf->getExtensionPenalty(j); + mv = max(m, m2); + + if(mv == m){ + F2[i][j] = mv; + D2[i][j] = DIAG; + } + else if(mv == m2){ + F2[i][j] = m2; + D2[i][j] = LEFT; + } + else{ + ERROR("Error in NWGAlign", exception); + } + } + } + } + + // SSEA variant + /** + * + * @param v1 + * @param v2 + * @param update + */ + void + NWGAlign::pCalculateMatrix(const vector &v1, + const vector &v2, bool update) { + + //TODO + } + +}} // namespace diff --git a/Align2/Sources/NWGAlign.h b/Align2/Sources/NWGAlign.h new file mode 100755 index 0000000..4338b02 --- /dev/null +++ b/Align2/Sources/NWGAlign.h @@ -0,0 +1,98 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __NWGAlign_H__ +#define __NWGAlign_H__ + +#include + +namespace Victor { namespace Align2{ + + /** @brief Implement Needleman-Wunsch-Gotoh global alignment. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + + **/ + class NWGAlign : public Align { + public: + + // CONSTRUCTORS: + + /// Default constructor. + NWGAlign(AlignmentData *ad, GapFunction *gf, ScoringScheme *ss); + + /// Constructor with weighted alignment positions. + NWGAlign(AlignmentData *ad, GapFunction *gf, ScoringScheme *ss, + const vector &v1, const vector &v2); + + /// Copy constructor. + NWGAlign(const NWGAlign &orig); + + /// Destructor. + virtual ~NWGAlign(); + + + // OPERATORS: + + /// Assignment operator. + NWGAlign& operator =(const NWGAlign &orig); + + + // PREDICATES: + + /// Return two-element array containing an alignment with maximal score. + virtual void getMultiMatch(); + + ///Generate and return an ensemble of suboptimal alignments. + virtual vector generateMultiMatch(unsigned int num = 1); + + // MODIFIERS: + + /// Copy orig object to this object ("deep copy"). + virtual void copy(const NWGAlign &orig); + + /// Construct a new "deep copy" of this object. + virtual NWGAlign* newCopy(); + + /// Return two-element array containing an alignment with maximal score. + virtual vector getMatch() const; + + /// Return alignment score. + virtual double getScore() const; + + // HELPERS: + + /// Update/create matrix values. + virtual void pCalculateMatrix(bool update = true); + + /// Update/create weighted matrix values. + virtual void pCalculateMatrix(const vector &v1, + const vector &v2, bool update = true); + + protected: + + private: + //F from Align class as Gotoh diag score matrix. + vector< vector > F1; ///< Gotoh left score matrix. + vector< vector > F2; ///< Gotoh up score matrix. + vector< vector > D; ///< Gotoh diag traceback matrix. + vector< vector > D1; ///< Gotoh left traceback matrix. + vector< vector > D2; ///< Gotoh up traceback matrix. + }; + +}} // namespace + +#endif diff --git a/Align2/Sources/SubMatrix.cc b/Align2/Sources/SubMatrix.cc index ff13ac4..f879425 100755 --- a/Align2/Sources/SubMatrix.cc +++ b/Align2/Sources/SubMatrix.cc @@ -32,6 +32,32 @@ namespace Victor { namespace Align2{ SubMatrix::SubMatrix(istream &is) : Substitution() { is >> *this; + double res = residues.size()-2; //avoids '-' + double avg1 = 0, avg2 = 0, avg3 = 0; + for(int i=0;i<=res;i++){ + for(int j=0;j<=i;j++){ + double value = score[residues[i]][residues[j]]; + avg1 += value; + if(residues[i] == residues[j]){ + avg2 += value; + } + else{ + avg3 += value; + } + } + } + minScore = score[0][0]; + for(int i=0;i<= res;i++){ + for(int j=0;j<=i;j++){ + double value = score[residues[i]][residues[j]]; + if(value < minScore){ + minScore = value; + } + } + } + averageScore = avg1 / ((res*res)/2); + averageMatchScore = avg2 / res; + averageMismatchScore = avg3 / (((float)(res*res-res))/2); } SubMatrix::SubMatrix(const SubMatrix &orig) { @@ -96,6 +122,10 @@ namespace Victor { namespace Align2{ residuescores.push_back(orig.residuescores[n]); residues = orig.residues; + averageMatchScore = orig.averageMatchScore; + averageMismatchScore = orig.averageMatchScore; + averageScore = orig.averageScore; + minScore = orig.minScore; } /** * @@ -107,4 +137,20 @@ namespace Victor { namespace Align2{ return tmp; } + double SubMatrix::getAverageMismatchScore(){ + return -averageMismatchScore; + } + + double SubMatrix::getAverageMatchScore(){ + return averageMatchScore; + } + + double SubMatrix::getAverageScore(){ + return averageScore; + } + + double SubMatrix::getMinScore(){ + return minScore; + } + }} // namespace diff --git a/Align2/Sources/SubMatrix.h b/Align2/Sources/SubMatrix.h index 555934d..dc5e216 100755 --- a/Align2/Sources/SubMatrix.h +++ b/Align2/Sources/SubMatrix.h @@ -65,6 +65,17 @@ namespace Victor { namespace Align2{ /// Return the dimension of the matrix. virtual unsigned int size() const; + /// Return the average mismatch score. + double getAverageMismatchScore(); + + /// Return the average match score. + double getAverageMatchScore(); + + /// Return the average score. + double getAverageScore(); + + /// Return the minimal score. + double getMinScore(); // MODIFIERS: @@ -74,7 +85,6 @@ namespace Victor { namespace Align2{ /// Construct a new "deep copy" of this object. virtual SubMatrix* newCopy(); - protected: @@ -84,7 +94,10 @@ namespace Victor { namespace Align2{ vector< vector > residuescores; ///< Similarity scores. string residues; ///< Alphabet of allowed characters. - + double averageMismatchScore; + double averageMatchScore; + double averageScore; + double minScore; }; // ----------------------------------------------------------------------------- diff --git a/Align2/Sources/Substitution.cc b/Align2/Sources/Substitution.cc index 25dd77c..e12627e 100755 --- a/Align2/Sources/Substitution.cc +++ b/Align2/Sources/Substitution.cc @@ -103,14 +103,26 @@ namespace Victor { namespace Align2{ for (unsigned int i = 0; i < residues.size(); i++) { char res1 = residues[i]; - + char res1_bis = res1; + if(res1>=65 && res1<=90){ + res1_bis = res1 + 32; + } + if(res1>=97 && res1<=122){ + res1_bis = res1 - 32; + } for (unsigned int j = 0; j <= i; j++) { char res2 = residues[j]; + char res2_bis = res2; + if(res2>=65 && res2<=90){ + res2_bis = res2 + 32; + } + if(res2>=97 && res2<=122){ + res2_bis = res2 - 32; + } score[res1][res2] = score[res2][res1] = - score[res1][res2 + 32] = score[res2 + 32][res1] = - score[res1 + 32][res2] = score[res2][res1 + 32] = - score[res1 + 32][res2 + 32] = score[res2 + 32][res1 + 32] = - residuescores[i][j]; + score[res1][res2_bis] = score[res2_bis][res1] = + score[res1_bis][res2] = score[res2][res1_bis] = + score[res1_bis][res2_bis] = score[res2_bis][res1_bis] = residuescores[i][j]; } } } diff --git a/Makefile b/Makefile index c0f7aa4..2b7a7bc 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ # Path to project directory. UPDIR = . # Path to subdirectories. -SUBDIRS = tools Energy/Sources Biopool/Sources Align2/Sources Energy/Sources/TorsionPotential Lobo/Sources Lobo/APPS Energy/APPS Biopool/APPS Align2/APPS +SUBDIRS = tools Phylogenesis/Sources Energy/Sources Biopool/Sources Align2/Sources Energy/Sources/TorsionPotential Lobo/Sources Lobo/APPS Energy/APPS Biopool/APPS Align2/APPS Phylogenesis/APPS # # Libraries and paths (which are not defined globally). diff --git a/Phylogenesis/APPS/Makefile b/Phylogenesis/APPS/Makefile new file mode 100755 index 0000000..5aabbe2 --- /dev/null +++ b/Phylogenesis/APPS/Makefile @@ -0,0 +1,53 @@ +#--*- makefile -*-------------------------------------------------------------- +# +# Standard makefile +# +#------------------------------------------------------------------------------ + +# Path to project directory +UPDIR = ../.. +# Path to subdirectories +SUBDIR = +# Path to directory for binaries +BINPATH = ../../bin + +# +# Libraries and paths (which are not defined globally) +# + +LIBS = -lPhylogenesis -lAlign2 -ltools + +LIB_PATH = -L. + +INC_PATH = -I. -I../../Phylogenesis/Sources -I../../tools/ -I../../Align2/Sources + +# +# Objects and headers +# + +SOURCES = PhylogeneticTrees.cc Msa.cc + +OBJECTS = PhylogeneticTrees.o Msa.o + +TARGETS = PhylogeneticTrees Msa + +EXECS = PhylogeneticTrees Msa + +LIBRARY = APPSlibPhylogenesis.a + +# +# Install rule +# + +compile: all + +all: install +install: $(LIBRARY) $(TARGETS) + mv $(LIBRARY) $(UPDIR)/lib/ + mv $(EXECS) $(BINPATH)/ + +# +# Call global Makefile to do the job +# + +include ../../Makefile.global diff --git a/Phylogenesis/APPS/Msa.cc b/Phylogenesis/APPS/Msa.cc new file mode 100644 index 0000000..1394394 --- /dev/null +++ b/Phylogenesis/APPS/Msa.cc @@ -0,0 +1,201 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace Victor::Align2; +using namespace Victor::Phylogenesis; +using namespace std; + +void sShowHelp() { + cout<<"\nMULTILPLE SEQUENCE ALIGNMENT GENERATOR" + << "\nThis program is an implementation of ClustalW (Thompson, Higgins & Gibson, 1994)," + << "\none variant of the progressive method for multiple sequence alignment." + << "\nGuide tree is builded with NJ (or UPGMA) and provided as output in the Newick tree format." + << "\nInput must be provided as FASTA format, it is possible to provide also the scoring matrix as input " + << "\nor it can be dynamically calculated using global, local or freeshift alignments." + <<" \n" + << "\nOptions:" + << "\n * [--in ] \t Name of input FASTA file" + << "\n [--verbose] \t Verbose mode" + << "\n" + << "\nDistance matrix:" + << "\n [--global] \t Needleman-Wunsch-Gotoh global alignment (default)" + << "\n [--local] \t Smith-Waterman local alignment" + << "\n [--freeshift] \t Free-shift alignment" + << "\n [--matrix ] \t Custom distance matrix" + << "\n [-m ] \t Name of substitution matrix file (default = blosum30.dat)" + << "\n [-o ] \t Open gap penalty (default = 10.00)" + << "\n [-e ] \t Extension gap penalty (default = 0.10)" + << "\n" + << "\nClustering method:" + << "\n [--upgma] \t Unweighted Pair Group Method with Arithmetic Mean (default = Neighbor joining)" + << "\n" + << "\nClustalW options:" + << "\n [--cwo ] \t Open gap penalty (default = 10.00)" + << "\n [--cwe ] \t Extension gap penalty (default = 0.20)" + << "\n [--cutoff ] \t Cutoff value for divergent sequences (default = 0.4)" + << "\n [--pam ] \t PAM substitution matrix series (default = Blosum)" + << "\n" + << "\nOutput format:" + << "\n [--outNewick ]\t Name of the Newick tree format output file (default = to screen)" + << "\n [--outMsa ] \t Name of the MSA output FASTA file (default = to screen)" + << "\n [--clustal] \t MSA output in clustal format (default = FASTA)" + << "\n\n"; +} + +int main( int argc, char* argv[] ){ + + cout<<"VICTOR - Multiple sequence alignment generator\n\n"; + + // + // Init options + // + + string inputFileName, matrixFileName, cwMatrixFileName, scoringMatrixFileName, newickOutputFileName, msaOutputFileName; + double openGapPenalty, extensionGapPenalty, cwOpenGapPenalty, cwExtensionGapPenalty, cutoff; + bool verbose, upgma, pam, clustal; + + if (getArg("h", argc, argv) || argc == 1) { + sShowHelp(); + return 1; + } + + getArg("-in", inputFileName, argc, argv, "!"); + getArg("-matrix", scoringMatrixFileName, argc, argv, "!"); + getArg("-outNewick", newickOutputFileName, argc, argv, "!"); + getArg("-outMsa", msaOutputFileName, argc, argv, "!"); + + AlignAlgorithm alignAlgorithm = global; + if(getArg("-local", argc, argv)) + alignAlgorithm = local; + if(getArg("-freeshift", argc, argv)) + alignAlgorithm = freeshift; + + getArg("m", matrixFileName, argc, argv, "blosum30.dat"); + getArg("o", openGapPenalty, argc, argv, 10.0); + getArg("e", extensionGapPenalty, argc, argv, 0.1); + + getArg("-cwo", cwOpenGapPenalty, argc, argv, 10.00); + getArg("-cwe", cwExtensionGapPenalty, argc, argv, 0.2); + getArg("-cutoff", cutoff, argc, argv, 0.4); + + upgma = getArg("-upgma", argc, argv); + pam = getArg("-pam", argc, argv); + clustal = getArg("-clustal", argc, argv); + verbose = getArg("-verbose", argc, argv); + + // + // Load data + // + + string path = getenv("VICTOR_ROOT"); + if (path.length() < 3) + cout << "Warning: environment variable VICTOR_ROOT is not set." << endl; + string dataPath = path + "data/"; + + vector sequences; + if (inputFileName != "!") { + ifstream inputFile(inputFileName.c_str()); + if (!inputFile) + ERROR("Error opening input FASTA file.", exception); + sequences = Sequence::LoadFasta(inputFile); + if (sequences.size() < 1) + ERROR("Input FASTA file must contain at least two sequences.", exception); + } else + ERROR("ClustalW needs input FASTA file.", exception); + + matrixFileName = dataPath + matrixFileName; + ifstream matrixFile(matrixFileName.c_str()); + if (!matrixFile) + ERROR("Error opening substitution matrix file.", exception); + SubMatrix* sub = new SubMatrix(matrixFile); + + // + // Distance matrix + // + + DistanceMatrix* dm; + if(scoringMatrixFileName == "!"){ + dm = new DistanceMatrix(sequences, alignAlgorithm, sub, openGapPenalty, extensionGapPenalty); + } + else{ + dm = new DistanceMatrix(scoringMatrixFileName); + } + + // + // Clustering + // + + HierarchicalClusterBuilder* builder; + if(upgma){ + builder = new UPGMA(sequences, dm); + } + else{ + builder = new NeighborJoining(sequences, dm); + } + + // + // ClustalW + // + + ClustalW* cw = new ClustalW(dm, builder, sequences, cutoff, cwOpenGapPenalty, cwExtensionGapPenalty, pam, verbose); + MultipleAlignment* result = cw->performMultipleSequenceAlignment(); + + // + // Newick + // + + PhylogeneticTreeExporter* expoter; + if(newickOutputFileName == "!"){ + expoter = new NewickPhylogeneticTreeExporter(); + cout<<"\nGuide tree:\n"; + } + else{ + expoter = new NewickPhylogeneticTreeExporter(newickOutputFileName); + } + expoter->save(cw->getGuideTree()); + + // + // Output MSA + // + + if(msaOutputFileName != "!"){ + cout<<"\nWriting multiple alignment file "<saveClustal(output) : result->saveFasta(output); + output.close(); + return 0; + } + cout<<"\nResult:\n"<saveClustal(cout) : result->saveFasta(cout); +} + + diff --git a/Phylogenesis/APPS/PhylogeneticTrees.cc b/Phylogenesis/APPS/PhylogeneticTrees.cc new file mode 100644 index 0000000..1bb59df --- /dev/null +++ b/Phylogenesis/APPS/PhylogeneticTrees.cc @@ -0,0 +1,166 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace Victor; +using namespace Victor::Align2; +using namespace Victor::Phylogenesis; + +using namespace std; + +void sShowHelp() { + cout<<"\nPHYLOGENETIC TREES GENERATOR" + << "\nThis program calculates phylogenetic trees for the sequences provided as FASTA input." + << "\nIt is possible to provide the scoring matrix as input or it can be dynamically calculated using global," + <<" \nlocal or freeshift alignments." + << "\nSupported algorithms are UPGMA and NJ, the tree is generated in the Newick tree format." + <<" \n" + << "\nOptions:" + << "\n * [--in ] \t Name of input FASTA file" + << "\n [--verbose] \t Verbose mode" + << "\n" + << "\nDistance matrix:" + << "\n [--global] \t Needleman-Wunsch-Gotoh global alignment (default)" + << "\n [--local] \t Smith-Waterman local alignment" + << "\n [--freeshift] \t Free-shift alignment" + << "\n [--matrix ] \t Custom distance matrix" + << "\n [-m ] \t Name of substitution matrix file (default = blosum30.dat)" + << "\n [-o ] \t Open gap penalty (default = 10.00)" + << "\n [-e ] \t Extension gap penalty (default = 0.10)" + << "\n" + << "\nClustering method:" + << "\n [--upgma] \t Unweighted Pair Group Method with Arithmetic Mean (default = Neighbor joining)" + << "\n" + << "\nOutput format:" + << "\n [--out ] \t Name of the Newick tree format output file (default = to screen)" + << "\n [--rooted] \t Reroot the tree with 'mid-point' method (NJ only) " + << "\n\n"; +} + +int main( int argc, char* argv[] ){ + + cout<<"VICTOR - Phylogenetic trees generator\n\n"; + + // + // Init options + // + + string inputFileName, matrixFileName, scoringMatrixFileName, outputFileName; + double openGapPenalty, extensionGapPenalty; + bool verbose, upgma, rooted; + + if (getArg("h", argc, argv) || argc == 1) { + sShowHelp(); + return 1; + } + + getArg("-in", inputFileName, argc, argv, "!"); + getArg("-matrix", scoringMatrixFileName, argc, argv, "!"); + getArg("-out", outputFileName, argc, argv, "!"); + + AlignAlgorithm alignAlgorithm = global; + if(getArg("-local", argc, argv)) + alignAlgorithm = local; + if(getArg("-freeshift", argc, argv)) + alignAlgorithm = freeshift; + + getArg("m", matrixFileName, argc, argv, "blosum30.dat"); + getArg("o", openGapPenalty, argc, argv, 10.0); + getArg("e", extensionGapPenalty, argc, argv, 0.1); + + upgma = getArg("-upgma", argc, argv); + verbose = getArg("-verbose", argc, argv); + rooted = getArg("-rooted", argc, argv); + + // + // Load data + // + + string path = getenv("VICTOR_ROOT"); + if (path.length() < 3) + cout << "Warning: environment variable VICTOR_ROOT is not set." << endl; + string dataPath = path + "data/"; + + vector sequences; + if (inputFileName != "!") { + ifstream inputFile(inputFileName.c_str()); + if (!inputFile) + ERROR("Error opening input FASTA file.", exception); + sequences = Sequence::LoadFasta(inputFile); + if (sequences.size() < 1) + ERROR("Input FASTA file must contain at least two sequences.", exception); + } else + ERROR("PhylogeneticTrees needs input FASTA file.", exception); + + matrixFileName = dataPath + matrixFileName; + ifstream matrixFile(matrixFileName.c_str()); + if (!matrixFile) + ERROR("Error opening substitution matrix file.", exception); + SubMatrix* sub = new SubMatrix(matrixFile); + + // + // Distance matrix + // + + DistanceMatrix* dm; + if(scoringMatrixFileName == "!"){ + dm = new DistanceMatrix(sequences, alignAlgorithm, sub, openGapPenalty, extensionGapPenalty); + } + else{ + dm = new DistanceMatrix(scoringMatrixFileName); + } + dm->init(); + + // + // Clustering + // + + HierarchicalClusterBuilder* builder; + if(upgma){ + builder = new UPGMA(sequences, dm); + } + else{ + builder = new NeighborJoining(sequences, dm); + } + + Cluster* tree = builder->performClustering(); + + // + // Newick export + // + + std::cout<<"\nExporting phylogenetic tree in the Newick format...\n"; + PhylogeneticTreeExporter* exporter = outputFileName == "!" ? new NewickPhylogeneticTreeExporter() : new NewickPhylogeneticTreeExporter(outputFileName); + exporter->save(tree); + + if(!upgma && rooted){ + std::cout<<"\nExporting phylogenetic rooted-tree in the Newick format...\n"; + Cluster* rootedTree = Cluster::midpointRooting(tree, sequences.size()); + PhylogeneticTreeExporter* expoter = outputFileName == "!" ? new NewickPhylogeneticTreeExporter() : new NewickPhylogeneticTreeExporter("rooted_"+outputFileName); + expoter->save(rootedTree); + } +} diff --git a/Phylogenesis/Sources/CWGapFunction.cc b/Phylogenesis/Sources/CWGapFunction.cc new file mode 100644 index 0000000..e8d4cde --- /dev/null +++ b/Phylogenesis/Sources/CWGapFunction.cc @@ -0,0 +1,182 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include + +namespace Victor { namespace Phylogenesis { + +// Utility functions + +std::map init(){ + std::map m; + m['A'] = 1.13; + m['C'] = 1.13; + m['D'] = 0.96; + m['E'] = 1.31; + m['F'] = 1.20; + m['G'] = 0.61; + m['H'] = 1.00; + m['I'] = 1.32; + m['K'] = 0.96; + m['L'] = 1.21; + m['M'] = 1.29; + m['N'] = 0.63; + m['P'] = 0.74; + m['Q'] = 1.07; + m['R'] = 0.72; + m['S'] = 0.76; + m['T'] = 0.89; + m['V'] = 1.25; + m['Y'] = 1.00; + m['W'] = 1.23; + return m; +} + +static std::map PascarellaAndArgosSpecificGapModificationFactors = init(); + +int countGaps(const std::vector& seqs, int pos){ + int tot = 0; + for(unsigned int i = 0; iisGap(pos)){ + tot++; + } + } + return tot; +} + +CWGapFunction::CWGapFunction(double initialGop, double initiapGep, const std::vector& vertical, const std::vector& horizontal) : + initialGop(initialGop), initialGep(initiapGep) { + verticalGops.reserve(vertical.size()); + verticalGeps.reserve(vertical.size()); + horizontalGops.reserve(horizontal.size()); + horizontalGeps.reserve(horizontal.size()); + initGapValues(vertical, verticalGops, verticalGeps); + initGapValues(horizontal, horizontalGops, horizontalGeps); +} + +CWGapFunction::~CWGapFunction() {} + +void CWGapFunction::initGapValues(const std::vector& seqs, std::vector& gops, std::vector& geps){ + + //Init position-specific gap penalties + unsigned int totalPos = seqs[0]->sequence.size(); + + for(unsigned int POS=0; POSisGap(POS)){ + withGap++; + } + else{ + withoutGap++; + } + } + + //1) Lowered gap penalties at existing gaps + + geps.push_back(withGap > 0 ? (initialGep / 2.0) : initialGep); + if(withGap>0){ + gops.push_back(initialGop*0.3*((double)withoutGap/seqs.size())); + continue; + } + + //2) If a position does not have any gaps but is within 8 residues of an existing gap, the GOP is increased + // by GOP -> GOP*(2+((8-distance from gap)*2)/8) + + bool closeGap = false; + int distance = 0; + for(unsigned int i=1;i<=8 && !closeGap ;i++){ + distance++; + if(countGaps(seqs,POS+i) > 0 || countGaps(seqs,POS-i) > 0){ + closeGap = true; + } + } + if(closeGap){ + gops.push_back(initialGop*(2.0+((8.0-(double)distance)*2.0)/8.0)); + continue; + } + + //3) Reduced gap penalties in hydrophilic stretches. + // Any run of 5 hydrophilic residues is considered to be a hydrophilic stretch. + + bool foundHydrophilicStretch = false; + for(unsigned int i =0; i< seqs.size() && !foundHydrophilicStretch ;i++){ + for(int c = (int)POS-4; c <= (int)POS && !foundHydrophilicStretch; c++){ + foundHydrophilicStretch = seqs[i]->isHydrophilicStretch(c); + } + } + if(foundHydrophilicStretch){ + gops.push_back(initialGop* 2.0/3.0); + continue; + } + + //4) Residue specific penalties + // If there is no hydrophilic stretch and the position does not contain any gaps, then the GOP is multiplied by + // Pascarella and Argos modification factors. If there is a mixture of residues at a position, the multiplication + // factor is the average of all the contributions from each sequence. + + double avg = 0; + for(unsigned int i =0; i< seqs.size() ;i++){ + avg += PascarellaAndArgosSpecificGapModificationFactors[seqs[i]->sequence[POS]]; + } + avg = avg / seqs.size(); + gops.push_back(initialGop * avg); + } + + #ifdef DEBUG + for(unsigned int POS=0; POSsequence; + for(unsigned int j=0;j. +*/ + +#ifndef __CWGapFunction_h__ +#define __CWGapFunction_h__ + +#include +#include +#include +#include +#include + +namespace Victor { namespace Phylogenesis { + + /** @brief Implements ClustalW Multiple Sequence Alignment (MSA) gap function + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class CWGapFunction: public MsaGapFunction { + public: + + /// Default constructor. + CWGapFunction(double initialGop, double initiapGep, const std::vector& vertical, const std::vector& horizontal); + + /// Destructor. + virtual ~CWGapFunction(); + + /// Return open gap penalty for horizontal seq at position p. + double getHorizontalOpenPenalty(int pos); + + /// Return extension gap penalty for horizontal seq at position p. + double getHorizontalExtensionPenalty(int pos); + + /// Return open gap penalty for vertical seq at position p. + double getVerticalOpenPenalty(int pos); + + /// Return extension gap penalty for vertical seq at position p. + double getVerticalExtensionPenalty(int pos); + + private: + + // ATTRIBUTES: + double initialGop, initialGep; + std::vector horizontalGops; + std::vector verticalGops; + std::vector horizontalGeps; + std::vector verticalGeps; + + /// Init initial gap values + void initGapValues(const std::vector& seqs, std::vector& gops, std::vector& geps); + }; +}} + +#endif diff --git a/Phylogenesis/Sources/CWScoringScheme.cc b/Phylogenesis/Sources/CWScoringScheme.cc new file mode 100644 index 0000000..7fcadab --- /dev/null +++ b/Phylogenesis/Sources/CWScoringScheme.cc @@ -0,0 +1,43 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include "CWScoringScheme.h" + +namespace Victor { namespace Phylogenesis{ + + CWScoringScheme::CWScoringScheme(SubMatrix *sub, std::vector vertical, std::vector horizontal) : + vertical(vertical), horizontal(horizontal), sub(sub){ + } + + double CWScoringScheme::scoring(int i, int j){ + + //notice that in the alignment algorithm sequences indices start at 1 and not 0 + double score = 0; + for(unsigned int ii=0; iisequence[i-1]; + for(unsigned int jj=0; jjsequence[j-1]; + //score with gap equals 0 + if(v_i != '-' && h_j != '-'){ + score += (sub->score[v_i][h_j] * vertical[ii]->normalizedWeight * horizontal[jj]->normalizedWeight); + } + } + } + return score/(double)(horizontal.size()*vertical.size()); + } + +}} diff --git a/Phylogenesis/Sources/CWScoringScheme.h b/Phylogenesis/Sources/CWScoringScheme.h new file mode 100644 index 0000000..cd42a5e --- /dev/null +++ b/Phylogenesis/Sources/CWScoringScheme.h @@ -0,0 +1,53 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __ClustalWScoringScheme_h__ +#define __ClustalWScoringScheme_h__ + +#include +#include +#include + +using namespace Victor::Align2; + +namespace Victor { namespace Phylogenesis{ + + /** @brief Implements ClustalW Multiple Sequence Alignment (MSA) scoring function. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class CWScoringScheme : public MsaScoringScheme{ + + public: + + /// Default constructor. + CWScoringScheme(SubMatrix *sub, std::vector vertical, std::vector horizontal); + + /// Scoring function. + virtual double scoring(int i, int j); + + private: + + // ATTRIBUTES + + SubMatrix* sub; + + std::vector horizontal, vertical; + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/ClustalW.cc b/Phylogenesis/Sources/ClustalW.cc new file mode 100644 index 0000000..af9efdd --- /dev/null +++ b/Phylogenesis/Sources/ClustalW.cc @@ -0,0 +1,266 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Victor { namespace Phylogenesis{ + +ClustalW::ClustalW(DistanceMatrix* dm, HierarchicalClusterBuilder* clusterBuilder, vector sequences, double cutoff, double gop, double gep, bool pam, bool verbose) + : dm(dm), clusterBuilder(clusterBuilder), sequences(sequences), cutoff(cutoff), gop(gop), gep(gep), pam(pam), verbose(verbose){ + subMatrixFactory = new SubMatrixFactory(true); +} + +MultipleAlignment* ClustalW::performMultipleSequenceAlignment(){ + + // Stage 1: all pairs of sequences are aligned separately in order + // to calculate a distance matrix giving the divergence of each pair + // of sequences; + + dm->init(); + bool divSeqs = selectMostDivergentSequences(); + + // Stage 2: guide tree is calculated from the distance matrix; + + guideTree = clusterBuilder->performClustering(); + if(!guideTree->isRooted()){ + // Midpoint root for NJ clustering + guideTree = Cluster::midpointRooting(guideTree, sequences.size()); + } + + calculatesWeights(guideTree, 0.0); + double max = getMaxWeight(guideTree, 0.0); + normalizesWeights(guideTree, max); + //print(guideTree); + + // Stage 3: the sequences are progressively aligned according + // to the branching order in the guide tree; + // The basic procedure at this stage is to use a series of pairwise + // alignments to align larger and larger groups of sequences, + // following the branching order in the guide tree. + + cout<<"Start of Multiple Alignment\n\nAligning..."< result = alignCluster(guideTree); + + if(divSeqs){ + cout<<"\nAligning most divergent sequences"<::max(); + for(unsigned int j=0;jidentityAt(sequences[i]->id,sequences[j]->id); + } + avg = avg/(sequences.size()-1); + if(avg < cutoff){ + sequences[i]->divergent = true; + cout<<"Warning seq "<id<<" is divergent (avg identity score "< ClustalW::alignMostDivergentSequences(vector msa){ + + vector div = vector(); + for(unsigned int i=0;idivergent){ + div.push_back(sequences[i]); + } + } + + while(!div.empty()){ + + int it = 0; + vector lessDiv = vector(); + lessDiv.push_back(div[0]); + + double pcid = getPercentIdentity(lessDiv, msa); + for(unsigned int i=1;i tmp = vector(); + tmp.push_back(div[i]); + double currentPcid = getPercentIdentity(tmp, msa); + if(currentPcid > pcid){ + it = i; + pcid = currentPcid; + lessDiv = tmp; + } + } + if(msa.size() > 0){ + msa = multipleSequenceAlignment(msa, lessDiv); + } + else{ + msa = lessDiv; + } + + div.erase(div.begin()+it); + } + + return msa; +} + +vector ClustalW::alignCluster(Cluster* c){ + if(c->isSingle()){ + SingleCluster* sc = static_cast(c); + vector res = vector(); + if(!sc->sequence->divergent){ + res.push_back(sc->sequence); + } + return res; + } + + vector leftMsa = alignCluster(c->branches[0]); + vector rightMsa = alignCluster(c->branches[1]); + return multipleSequenceAlignment(leftMsa, rightMsa); +} + +vector ClustalW::multipleSequenceAlignment(const vector& left, const vector& right){ + + //For divergent seqs + if(left.size() == 0 && right.size() == 0){ + cout<<"Delayed"<(); + } + if(left.size() == 0){ + cout<<"Delayed"<getAverageMismatchScore() * scale; + double initialGep = gep * (1.0 + std::abs(log(leftLength/rightLength))); + + CWScoringScheme* ss = new CWScoringScheme(sub, left, right); + CWGapFunction *gf = new CWGapFunction(initialGop, initialGep, left, right); + MsaNWG* nwmsa = new MsaNWG(ss, gf); + std::vector result = nwmsa->getMatch(left, right); + + cout<<"Sequences: "<getScore()<id; + for(unsigned int i=1;iid; + cout<<"] with ["<id; + for(unsigned int i=1;iid; + cout<<"]\n initial gop "<& left, const vector& right){ + double totalDistance = 0; + for(unsigned int i=0;iidentityAt(left[i]->id, right[j]->id)); + } + } + double pcid = totalDistance/(left.size()*right.size()); + return pcid*100; +} + +void ClustalW::getSubMatrix(double distance, SubMatrix*& sub, double& scale){ + if(pam){ + //Pam series + scale=0.75; + if(distance > 80.0){ + sub = subMatrixFactory->getSubMatrix(PAM20); + } + else if(distance > 60.0){ + sub = subMatrixFactory->getSubMatrix(PAM60); + } + else if(distance > 40.0){ + sub = subMatrixFactory->getSubMatrix(PAM120); + } + else{ + sub = subMatrixFactory->getSubMatrix(PAM350); + } + } + else{ + //Blosum series + scale = 0.75; + if(distance > 80.0){ + sub = subMatrixFactory->getSubMatrix(BLOSUM80); + } + else if(distance > 60.0){ + sub = subMatrixFactory->getSubMatrix(BLOSUM62); + } + else if (distance > 40.0) + { + sub = subMatrixFactory->getSubMatrix(BLOSUM45); + } + else if(distance > 30.0){ + scale = 0.5; + sub = subMatrixFactory->getSubMatrix(BLOSUM45); + } + else if (distance > 20.0) + { + scale=0.6; + sub = subMatrixFactory->getSubMatrix(BLOSUM45); + } + else + { + scale=0.6; + sub = subMatrixFactory->getSubMatrix(BLOSUM30); + } + } + scale *= 0.5; +} + +Cluster* ClustalW::getGuideTree(){ + return guideTree; +} + +}} diff --git a/Phylogenesis/Sources/ClustalW.h b/Phylogenesis/Sources/ClustalW.h new file mode 100644 index 0000000..80a21aa --- /dev/null +++ b/Phylogenesis/Sources/ClustalW.h @@ -0,0 +1,72 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __Clustal_W__ +#define __Clustal_W__ + +#include +#include +#include +#include +#include +#include +#include + +namespace Victor { namespace Phylogenesis{ + + /** @brief Multiple Sequence Alignment with ClustalW heuristic. + * (Thompson, Higgins & Gibson, 1994). + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class ClustalW + { + public: + /// Constructor. + ClustalW(DistanceMatrix* dm, HierarchicalClusterBuilder* clusterBuilder, vector sequences, + double cutoff, double gop, double gep, bool pam, bool verbose); + + /// Perform a Multiple Sequence Alignment. + MultipleAlignment* performMultipleSequenceAlignment(); + + /// Returns the guide tree + Cluster* getGuideTree(); + + private: + + // ATTRIBUTES: + + vector sequences; + double cutoff, gop, gep; + bool pam, verbose; + Cluster* guideTree; + DistanceMatrix* dm; + HierarchicalClusterBuilder* clusterBuilder; + SubMatrixFactory* subMatrixFactory; + + // FUNCTIONS: + + vector multipleSequenceAlignment(const vector& left, const vector& right); + vector alignCluster(Cluster* c); + vector alignMostDivergentSequences(vector msa); + double getPercentIdentity(const vector& left, const vector& right); + void getSubMatrix(double distance, SubMatrix*& sub, double& scale); + bool selectMostDivergentSequences(); + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/Cluster.cc b/Phylogenesis/Sources/Cluster.cc new file mode 100644 index 0000000..6e5dbe7 --- /dev/null +++ b/Phylogenesis/Sources/Cluster.cc @@ -0,0 +1,145 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include "Cluster.h" +#include "SingleCluster.h" +#include +#include + +namespace Victor { namespace Phylogenesis{ + +vector Cluster::leafs; + +void Cluster::initUnrootedTree(Cluster* cluster, Cluster* parent, double dist, int nseqs){ + cluster->dirs.resize(nseqs, -1); + if(parent != NULL){ + cluster->branches.push_back(parent); + cluster->lengths.push_back(dist); + } + if(cluster->isSingle()){ + leafs.push_back(cluster); + return; + } + for(unsigned int i=0;ibranches.size();i++){ + if(cluster->branches[i] != parent){ + initUnrootedTree(cluster->branches[i], cluster, cluster->lengths[i], nseqs); + } + } +} + +double Cluster::maxFarthestLeaf(Cluster* cluster, Cluster* parent, double dist, int id){ + + if(cluster->isSingle() && parent != NULL){ + return dist; + } + + double max = std::numeric_limits::min(); + int dir = -1; + + for(unsigned int i=0;ibranches.size();i++){ + if(cluster->branches[i] != parent){ + double d = maxFarthestLeaf(cluster->branches[i], cluster, dist + cluster->lengths[i], id); + if(d > max){ + max = d; + dir = i; + } + } + } + + cluster->dirs[id] = dir; + return max; +} + + +void Cluster::removeParent(Cluster* cluster, Cluster* parent){ + if(parent != NULL){ + int parentId = -1; + for(unsigned int i=0;ibranches.size();i++){ + if(cluster->branches[i] == parent){ + parentId = i; + } + } + cluster->branches.erase(cluster->branches.begin()+parentId); + cluster->lengths.erase(cluster->lengths.begin()+parentId); + cluster->dirs.clear(); + } +} + +void Cluster::removeParentRec(Cluster* cluster, Cluster* parent){ + removeParent(cluster, parent); + + if(cluster->isSingle()){ + return; + } + + for(unsigned int i=0;ibranches.size();i++){ + removeParentRec(cluster->branches[i], cluster); + } +} + +Cluster* Cluster::addHalfwayRoot(Cluster* cluster, Cluster* parent, double dist, int id, double avg, double lastDist){ + + if(dist < avg){ + Cluster* next = cluster->branches[cluster->dirs[id]]; + double nextDist = cluster->lengths[cluster->dirs[id]]; + return addHalfwayRoot(next, cluster, dist + nextDist, id, avg, nextDist); + } + + Cluster* root = new Cluster(); + root->branches.push_back(parent); + root->branches.push_back(cluster); + root->lengths.push_back(avg - (dist - lastDist)); + root->lengths.push_back(dist - avg); + + //Remove reverse edges + removeParent(cluster, parent); + removeParent(parent, cluster); + removeParentRec(cluster, NULL); + removeParentRec(parent, NULL); + + return root; +} + +Cluster* Cluster::midpointRooting(Cluster* tree, int leafsnum){ + + // Init the current cluster with parameters useful for making the rooted tree + // like reverse edges and extract the leafs + + leafs.clear(); + initUnrootedTree(tree, NULL, 0.0, leafsnum); + + // For each leafs find the other most farthest leaf and keeps the max + + int maxId = -1; + double maxDist = std::numeric_limits::min(); + + for(unsigned int i=0;i(leafs[i]); + double dist = maxFarthestLeaf(leafs[i], NULL, 0.0, leaf->sequence->id); + if(dist > maxDist){ + maxDist = dist; + maxId = i; + } + //cout<<"Leaf id "<sequence->id<<" with farthest leaf at "<(leafs[maxId]); + Cluster* rootedTree = addHalfwayRoot(l, NULL, 0.0, l->sequence->id, maxDist/2.0, 0.0); + return rootedTree; +} + +}} diff --git a/Phylogenesis/Sources/Cluster.h b/Phylogenesis/Sources/Cluster.h new file mode 100644 index 0000000..6f05c1f --- /dev/null +++ b/Phylogenesis/Sources/Cluster.h @@ -0,0 +1,108 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __Cluster_H__ +#define __Cluster_H__ + +#include + +using namespace std; + +namespace Victor { namespace Phylogenesis{ + + /** @brief It represents the group of one or more clusters (recursive). + * Given a clustering, each Cluster is a node of the clustering tree. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class Cluster{ + public: + + /// Default constructor. + Cluster() : distance(0), rooted(false){ + } + + /// Destructor. + virtual ~Cluster(){ + } + + /// Check if the current cluster not contains no other clusters. + virtual bool isSingle(); + + /// Check if the cluster is rooted + bool isRooted(); + + /// Set if the cluster is rooted + void setRooted(bool rt); + + // ATTRIBUTES: + + /// Pointers to the successors of of the current cluster. + std::vector branches; + + /// Length of the branches from the current cluster to the successors. + std::vector lengths; + + /// Distance of the current cluster. + double distance; + + /// Indices of the sequences contained in the current cluster. + std::vector indices; + + /// Reroot the current tree with midpoint method + static Cluster* midpointRooting(Cluster* tree, int leafsnum); + + private: + + // ATTRIBUTES + + /// Cluster is rooted + bool rooted; + + /// Directions used for root the tree + std::vector dirs; + + /// Current cluster's leafs + static vector leafs; + + // Static methods for making the rooted tree by midpoint method + + static void initUnrootedTree(Cluster* cluster, Cluster* parent, double dist, int nseqs); + + static double maxFarthestLeaf(Cluster* cluster, Cluster* parent, double dist, int id); + + static void removeParent(Cluster* cluster, Cluster* parent); + + static void removeParentRec(Cluster* cluster, Cluster* parent); + + static Cluster* addHalfwayRoot(Cluster* cluster, Cluster* parent, double dist, int id, double avg, double lastDist); + }; + + inline bool Cluster::isSingle(){ + return false; + } + + inline bool Cluster::isRooted(){ + return rooted; + } + + inline void Cluster::setRooted(bool rt){ + rooted = rt; + } + +}} + +#endif diff --git a/Phylogenesis/Sources/ClusterDistanceMatrix.cc b/Phylogenesis/Sources/ClusterDistanceMatrix.cc new file mode 100644 index 0000000..348382e --- /dev/null +++ b/Phylogenesis/Sources/ClusterDistanceMatrix.cc @@ -0,0 +1,71 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include "ClusterDistanceMatrix.h" + +namespace Victor { namespace Phylogenesis{ + + ClusterDistanceMatrix::ClusterDistanceMatrix(){} + + double ClusterDistanceMatrix::at(Cluster* c1, Cluster* c2) + { + if(distanceClusters.find(c1) != distanceClusters.end() && distanceClusters.at(c1).find(c2) != distanceClusters.at(c1).end()){ + return distanceClusters.at(c1).at(c2); + } + return distanceClusters.at(c2).at(c1); + } + + void ClusterDistanceMatrix::erase(Cluster* c1, Cluster* c2) + { + std::map > updatedDistanceClusters; + for (std::map >::iterator it=distanceClusters.begin(); it!=distanceClusters.end(); ++it) + { + if(it->first == c1 || it->first == c2) + continue; + + std::map row = it->second; + std::map updatedRow; + for(std::map::iterator itr=row.begin(); itr!=row.end(); ++itr) + { + if(itr->first == c1 || itr->first == c2) + continue; + + updatedRow.insert(*itr); + } + updatedDistanceClusters.insert(std::pair >(it->first, updatedRow)); + } + distanceClusters = updatedDistanceClusters; + } + + void ClusterDistanceMatrix::insert(Cluster* c1, Cluster* c2, double distance) + { + if(distanceClusters.find(c1) == distanceClusters.end() && distanceClusters.find(c2) != distanceClusters.end()){ + distanceClusters[c2].insert(std::pair(c1, distance)); + } + else if(distanceClusters.find(c2) == distanceClusters.end() && distanceClusters.find(c1) != distanceClusters.end()){ + distanceClusters[c1].insert(std::pair(c2, distance)); + } + else{ + std::map row; + distanceClusters.insert(std::pair >(c1, row)); + distanceClusters[c1].insert(std::pair(c2, distance)); + } + } + + void ClusterDistanceMatrix::clear(){ + distanceClusters.clear(); + } +}} diff --git a/Phylogenesis/Sources/ClusterDistanceMatrix.h b/Phylogenesis/Sources/ClusterDistanceMatrix.h new file mode 100644 index 0000000..1f60b0c --- /dev/null +++ b/Phylogenesis/Sources/ClusterDistanceMatrix.h @@ -0,0 +1,58 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __ClusterMatrix_H__ +#define __ClusterMatrix_H__ + +#include + +#include +#include + +namespace Victor { namespace Phylogenesis{ + + /** @brief Implements a triangular matrix usefull to store distances between Cluster without wasting memory. + * Access and update operations are performed in constant time, erase in linear time. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class ClusterDistanceMatrix{ + public: + /// Default constructor. + ClusterDistanceMatrix(); + + /// Access element + double at(Cluster* c1, Cluster* c2); + + /// Erase all (c1,*), (c2,*), (*,c1), (*,c2) elements in the matrix. + void erase(Cluster* c1, Cluster* c2); + + /// Add a new value to the matrix at (c1,c2) + void insert(Cluster* c1, Cluster* c2, double distance); + + /// Clear the matrix + void clear(); + + private: + + // ATTRIBUTES + + std::map > distanceClusters; + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/ClusterUtils.h b/Phylogenesis/Sources/ClusterUtils.h new file mode 100644 index 0000000..e9d59a4 --- /dev/null +++ b/Phylogenesis/Sources/ClusterUtils.h @@ -0,0 +1,107 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __ClusterUtils_h__ +#define __ClusterUtils_h__ + +#include +#include + +namespace Victor { namespace Phylogenesis{ + + /// Count number of sequences in the current cluster + static int countSequences(Cluster* c){ + if(c->isSingle()){ + return 1; + } + int tot = 0; + for(unsigned int i = 0; i < c->branches.size(); i++){ + tot+= countSequences(c->branches[i]); + } + return tot; + } + + /// Calculate weight sequences in the current cluster + static void calculatesWeights(Cluster* c, double weight){ + if(c->isSingle()){ + SingleCluster* cs = static_cast(c); + cs->sequence->weight = weight; + return; + } + for(unsigned int i = 0; i < c->branches.size(); i++){ + calculatesWeights(c->branches[i], weight + (c->lengths[i]/countSequences(c->branches[i]))); + } + } + + /// Get max weight in the cluster + static double getMaxWeight(Cluster* c, double max){ + if(c->isSingle()){ + SingleCluster* cs = static_cast(c); + if(cs->sequence->weight > max){ + max = cs->sequence->weight; + } + } + for(unsigned int i = 0; i < c->branches.size(); i++){ + double m = getMaxWeight(c->branches[i], max); + if(m > max){ + max = m; + } + } + return max; + } + + /// Normalizes weights in the current cluster + static void normalizesWeights(Cluster* c, double norm){ + if(c->isSingle()){ + SingleCluster* cs = static_cast(c); + cs->sequence->normalizedWeight = cs->sequence->weight / norm; + return; + } + for(unsigned int i = 0; i < c->branches.size(); i++){ + normalizesWeights(c->branches[i], norm); + } + } + + /// Returns the sum of all weights in the current cluster + static double countWeights(Cluster* c){ + if(c->isSingle()){ + SingleCluster* cs = static_cast(c); + return cs->sequence->weight; + } + double tot = 0; + for(unsigned int i = 0; i < c->branches.size(); i++){ + tot+= countWeights(c->branches[i]); + } + return tot; + } + + /// Prints weights and normalized weights in the current cluster + static void print(Cluster* c){ + if(c->isSingle()){ + SingleCluster* cs = static_cast(c); + string name = cs->sequence->name; + name = name.substr(0,name.find(' ')); + cout<sequence->weight<<" "<sequence->normalizedWeight<branches.size(); i++){ + print(c->branches[i]); + } + } + +}} + +#endif diff --git a/Phylogenesis/Sources/DistanceMatrix.cc b/Phylogenesis/Sources/DistanceMatrix.cc new file mode 100644 index 0000000..d1e90ee --- /dev/null +++ b/Phylogenesis/Sources/DistanceMatrix.cc @@ -0,0 +1,186 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Victor { namespace Phylogenesis{ + + DistanceMatrix::~DistanceMatrix(){} + + DistanceMatrix::DistanceMatrix(std::string fileName){ + + std::ifstream inputFile(fileName.c_str()); + if (!inputFile) + ERROR("Error opening distance matrix file.", exception); + + distanceMatrix.push_back(vector()); + identityMatrix.push_back(vector()); + int index_i = 0; + int index_j = 1; + + std::string line; + while (std::getline(inputFile, line)) + { + std::istringstream iss(line); + int a, b; + double c; + if (!(iss >> a >> b >> c)) + ERROR("Error reading distance matrix file.", exception); + + if(a == index_i && b == index_j) + { + distanceMatrix[a].push_back(c); + identityMatrix[a].push_back(1-c); + index_j++; + continue; + } + + if(a == index_i+1 && b == index_i+2) + { + distanceMatrix.push_back(vector()); + identityMatrix.push_back(vector()); + distanceMatrix[a].push_back(c); + identityMatrix[a].push_back(1-c); + index_i = a; + index_j = b + 1; + continue; + } + + ERROR("Error reading input distance matrix file, invalid index.", exception); + } + } + + DistanceMatrix::DistanceMatrix(vector sequences, AlignAlgorithm alignAlgorithm, SubMatrix* sub, double gop, double gep) : + alignAlgorithm(alignAlgorithm), sequences(sequences), sub(sub) { + gf = new AGPFunction(gop, gep); + } + + void DistanceMatrix::init(){ + + if(distanceMatrix.size() > 0) + return; + + cout<<"Start of Pairwise alignments\nAligning...\n"< result = pairwiseSequenceAlign(sequences[i]->sequence, sequences[j]->sequence); + double distanceScore = calculateDistance(result[0], result[1]); + appendDistance(i, distanceScore); + double identityScore = calculateIdentity(result[0], result[1]); + appendIdentity(i, identityScore); + //Output + cout<id+1<<":"<id+1<<") Aligned. Identity Score: "<= (int)distanceMatrix.size()) + distanceMatrix.push_back(vector()); + + distanceMatrix[i].push_back(value); + } + + void DistanceMatrix::appendIdentity(int i, double value){ + if(i >= (int)identityMatrix.size()) + identityMatrix.push_back(vector()); + + identityMatrix[i].push_back(value); + } + + double DistanceMatrix::at(int i, int j){ + int ii = std::min(i,j); + int jj = std::max(i,j); + + return distanceMatrix[ii][jj-ii-1]; + } + + double DistanceMatrix::identityAt(int i, int j){ + int ii = std::min(i,j); + int jj = std::max(i,j); + + return identityMatrix[ii][jj-ii-1]; + } + + vector DistanceMatrix::pairwiseSequenceAlign(string s1, string s2){ + SequenceData *ad = new SequenceData(2, s1, s2); + ScoringS2S *ss = new ScoringS2S(sub, ad, 0, 1.00); + Align* a; + switch(alignAlgorithm) { + case global: + a = new NWGAlign(ad, gf, ss); + break; + case local: + a = new SWAlign(ad, gf, ss); + break; + default: + a = new FSAlign(ad, gf, ss); + break; + } + + vector res = a->getMatch(); + delete a; + return res; + } + +}} diff --git a/Phylogenesis/Sources/DistanceMatrix.h b/Phylogenesis/Sources/DistanceMatrix.h new file mode 100644 index 0000000..390972b --- /dev/null +++ b/Phylogenesis/Sources/DistanceMatrix.h @@ -0,0 +1,93 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __DistanceMatrix_H__ +#define __DistanceMatrix_H__ + +#include +#include +#include +#include +#include + +using namespace Victor::Align2; + +namespace Victor { namespace Phylogenesis{ + + /// Align algorithm type + enum AlignAlgorithm { global = 0, local = 1, freeshift = 2 }; + + /** @brief Implements a triangular matrix useful to store distances between sequences without wasting memory. + * Access and update operations are performed in constant time. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class DistanceMatrix{ + public: + + DistanceMatrix(){}; + + /// Constructor that init the matrix from file. The file must be a sequence indexes of in the correct order, followed by the value (see samples/distance.dat). + DistanceMatrix(std::string fileName); + + /// Constructor. + DistanceMatrix(vector sequences, AlignAlgorithm alignAlgorithm, SubMatrix* sub, double gop, double gep); + + /// Destructor. + virtual ~DistanceMatrix(); + + /// Init the matrix from sequences calculating the pairwise alignments. + void init(); + + /// Add a new value to the matrix, insertion order must be respected. + void appendDistance(int i, double value); + + /// Add a new value to the matrix, insertion order must be respected. + void appendIdentity(int i, double value); + + /// Distance value access element + double at(int i, int j); + + /// Identity value access element + double identityAt(int i, int j); + + protected: + + //// ClustalW distance score from two optimal aligned sequences + virtual double calculateDistance(string alignedS1, string alignedS2); + + //// ClustalW identity score from two optimal aligned sequences + virtual double calculateIdentity(string alignedS1, string alignedS2); + + private: + + // ATTRIBUTES: + + std::vector > distanceMatrix; + std::vector > identityMatrix; + + GapFunction* gf; + SubMatrix* sub; + AlignAlgorithm alignAlgorithm; + vector sequences; + + /// Align 2 sequences + vector pairwiseSequenceAlign(string s1, string s2); + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/HierarchicalClusterBuilder.h b/Phylogenesis/Sources/HierarchicalClusterBuilder.h new file mode 100644 index 0000000..64eed5d --- /dev/null +++ b/Phylogenesis/Sources/HierarchicalClusterBuilder.h @@ -0,0 +1,41 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __HierarchicalClustering_H__ +#define __HierarchicalClustering_H__ + +#include + +namespace Victor { namespace Phylogenesis{ + + /** @brief Abstract class for hierarchical clustering. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class HierarchicalClusterBuilder + { + public: + /// Destructor. + virtual ~HierarchicalClusterBuilder(){} + /// Perform the hierarchical clustering and returns the clustering tree. + virtual Cluster* performClustering() = 0; + }; + +}} + +#endif + + diff --git a/Phylogenesis/Sources/Makefile b/Phylogenesis/Sources/Makefile new file mode 100644 index 0000000..873a8a1 --- /dev/null +++ b/Phylogenesis/Sources/Makefile @@ -0,0 +1,58 @@ + +#--*- makefile -*-------------------------------------------------------------- +# +# Standard makefile +# +#------------------------------------------------------------------------------ + +# Path to project directory +UPDIR = ../.. +# Path to subdirectories +SUBDIR = +# Path to directory for binaries +BINPATH = ../../bin + +# +# Libraries and paths (which are not defined globally) +# + +LIBS = -lAlign2 -lPhylogenesis -lBiopool -ltools -lPhylogenesis + +LIB_PATH = -L. + +INC_PATH = -I. -I../../Align2/Sources -I ../../Biopool/Sources -I../../tools/ + +# +# Objects and headers +# + +SOURCES = DistanceMatrix.cc ClusterDistanceMatrix.cc UPGMA.cc NewickPhylogeneticTreeExporter.cc NeighborJoining.cc \ + ClustalW.cc SubMatrixFactory.cc \ + MsaNWG.cc CWScoringScheme.cc CWGapFunction.cc Sequence.cc \ + MultipleAlignment.cc Cluster.cc + +OBJECTS = DistanceMatrix.o ClusterDistanceMatrix.o UPGMA.o NewickPhylogeneticTreeExporter.o \ + NeighborJoining.o ClustalW.o \ + MsaNWG.o CWScoringScheme.o CWGapFunction.o Sequence.o SubMatrixFactory.o \ + MultipleAlignment.o Cluster.o Sequence.o + +TARGETS = + +EXECS = + +LIBRARY = libPhylogenesis.a + +# +# Install rule +# + +compile: all + +install: $(LIBRARY) $(TARGETS) + mv $(LIBRARY) $(UPDIR)/lib + +# +# Call global Makefile to do the job +# +all: install +include ../../Makefile.global diff --git a/Phylogenesis/Sources/MsaGapFunction.h b/Phylogenesis/Sources/MsaGapFunction.h new file mode 100644 index 0000000..6980ebf --- /dev/null +++ b/Phylogenesis/Sources/MsaGapFunction.h @@ -0,0 +1,50 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __MsaGapFunction_h___ +#define __MsaGapFunction_h___ + +namespace Victor { namespace Phylogenesis { + + /** @brief Base class for Multiple Sequence Alignment (MSA) gap functions. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class MsaGapFunction{ + public: + + /// Default constructor. + MsaGapFunction(){} + + /// Destructor. + virtual ~MsaGapFunction(){} + + /// Return open gap penalty for horizontal seq at position p. + virtual double getHorizontalOpenPenalty(int pos) = 0; + + /// Return extension gap penalty for horizontal seq at position p. + virtual double getHorizontalExtensionPenalty(int pos) = 0; + + /// Return open gap penalty for vertical seq at position p. + virtual double getVerticalOpenPenalty(int pos) = 0; + + /// Return extension gap penalty for vertical seq at position p. + virtual double getVerticalExtensionPenalty(int pos) = 0; + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/MsaNWG.cc b/Phylogenesis/Sources/MsaNWG.cc new file mode 100644 index 0000000..4602d30 --- /dev/null +++ b/Phylogenesis/Sources/MsaNWG.cc @@ -0,0 +1,234 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include "MsaNWG.h" + +namespace Victor { namespace Phylogenesis { + +static int DIAG = 0; +static int LEFT = 1; +static int UP = 2; + +MsaNWG::MsaNWG(MsaScoringScheme* ss, MsaGapFunction* gf) : + ss(ss), gf(gf){ +} + +std::vector MsaNWG::getMatch(const vector& vertical, const vector& horizontal){ + + n = vertical[0]->sequence.length(); + m = horizontal[0]->sequence.length(); + + F = vector >(n + 1); + F1 = vector >(n + 1); + F2 = vector >(n + 1); + D = vector >(n + 1); + D1 = vector >(n + 1); + D2 = vector >(n + 1); + + vector frow(m + 1, -std::numeric_limits::max()); + vector drow(m + 1, -1); + + for (unsigned int i = 0; i < F.size(); ++i) { + F[i] = frow; + F1[i] = frow; + F2[i] = frow; + D[i] = drow; + D1[i] = drow; + D2[i] = drow; + } + + calculateMatrix(vertical, horizontal); + + //Traceback + + string res1; + string res2; + + int i = n; + int j = m; + int nextI = -1; + int nextJ = -1; + + double val = max(max(F[n][m], F1[n][m]), F2[n][m]); + int K = -1; + + if(val == F[n][m]){ + K = DIAG; + } + else if(val == F2[n][m]){ + K = LEFT; + } + else if(val == F1[n][m]){ + K = UP; + } + else{ + ERROR("Error in NWGAlign", exception); + } + + while (i > 0 || j > 0) { + if(K == DIAG){ + nextI = i-1; + nextJ = j-1; + K = D[i][j]; + } + else if(K == UP){ + nextI = i-1; + nextJ = j; + K = D1[i][j]; + } + else if(K == LEFT){ + nextI = i; + nextJ = j-1; + K = D2[i][j]; + } + else{ + ERROR("Error in NWGAlign", exception); + } + + // Here reports only where there are gaps in the profile alignment + if (i == nextI) { + res1 = res1 + "-"; + } else { + res1 = res1 + "*"; + } + if (j == nextJ) { + res2 = res2 + "-"; + } else { + res2 = res2 + "*"; + } + + i = nextI; + j = nextJ; + } + + reverse(res1.begin(), res1.end()); + reverse(res2.begin(), res2.end()); + + //Insert gaps + + for (unsigned int i = 0; i < res1.length(); ++i){ + if(res1[i] == '-'){ + for(unsigned int j = 0; j < vertical.size(); ++j){ + vertical[j]->sequence.insert(i, 1, '-'); + } + } + } + for (unsigned int i = 0; i < res2.length(); ++i){ + if(res2[i] == '-'){ + for(unsigned int j = 0; j < horizontal.size(); ++j){ + horizontal[j]->sequence.insert(i, 1, '-'); + } + } + } + + std::vector msa(vertical); + msa.insert(msa.end(),horizontal.begin(),horizontal.end()); + return msa; +} + +void MsaNWG::calculateMatrix(const vector& vertical, const vector& horizontal) +{ + F[0][0] = F1[0][0] = F2[0][0] = 0; + + //Init first column + F1[1][0] = 0; + D1[1][0] = UP; + F[1][0] = F2[1][0] = -std::numeric_limits::max(); + for (int i = 2; i <= static_cast (n); i++) { + F1[i][0] = F1[i-1][0] -gf->getVerticalExtensionPenalty(i); + D1[i][0] = UP; + F[i][0] = F2[i][0] = -std::numeric_limits::max(); + } + + //Init first row + F2[0][1] = 0; + D2[0][1] = LEFT; + F[0][1] = F1[0][1] = -std::numeric_limits::max(); + for (int j = 2; j <= static_cast (m); j++) { + F2[0][j] = F2[0][j-1] - gf->getHorizontalExtensionPenalty(j); + D2[0][j] = LEFT; + F[0][j] = F1[0][j] = -std::numeric_limits::max(); + } + + for (int i = 1; i <= static_cast (n); i++){ + for (int j = 1; j <= static_cast (m); j++) { + + //F - diag (i-1,j-1) + double s = ss->scoring(i, j); + double m = F[i-1][j-1] + s; + double m1 = F1[i-1][j-1] + s; + double m2 = F2[i-1][j-1] + s; + double mv = max(max(m, m1), m2); + + if(mv == m){ + F[i][j] = mv; + D[i][j] = DIAG; + } + else if(mv == m2){ + F[i][j] = m2; + D[i][j] = LEFT; + } + else if(mv == m1){ + F[i][j] = m1; + D[i][j] = UP; + } + else{ + ERROR("Error in NWGAlign", exception); + } + + //F1 - up (i-1,j) + m = F[i-1][j] - gf->getVerticalOpenPenalty(i); + m1 = F1[i-1][j] - gf->getVerticalExtensionPenalty(i); + mv = max(m, m1); + + if(mv == m){ + F1[i][j] = mv; + D1[i][j] = DIAG; + } + else if(mv == m1){ + F1[i][j] = m1; + D1[i][j] = UP; + } + else{ + ERROR("Error in NWGAlign", exception); + } + + //F2 - left (i,j-1) + m = F[i][j-1] - gf->getHorizontalOpenPenalty(j); + m2 = F2[i][j-1] - gf->getHorizontalExtensionPenalty(j); + mv = max(m, m2); + + if(mv == m){ + F2[i][j] = mv; + D2[i][j] = DIAG; + } + else if(mv == m2){ + F2[i][j] = m2; + D2[i][j] = LEFT; + } + else{ + ERROR("Error in NWGAlign", exception); + } + } + } +} + +double MsaNWG::getScore(){ + return max(max(F[n][m], F1[n][m]), F2[n][m]); +} + +}} diff --git a/Phylogenesis/Sources/MsaNWG.h b/Phylogenesis/Sources/MsaNWG.h new file mode 100644 index 0000000..23f9c7a --- /dev/null +++ b/Phylogenesis/Sources/MsaNWG.h @@ -0,0 +1,69 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef _MSANWG_H_ +#define _MSANWG_H_ + +#include +#include +#include +#include +#include + +namespace Victor { namespace Phylogenesis { + +using namespace Victor::Align2; + + /** @brief Implement Needleman-Wunsch-Gotoh global alignment for MSA. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class MsaNWG{ + + public: + + /// Default constructor. + MsaNWG(MsaScoringScheme* ss, MsaGapFunction* gf); + + /// Returns a sequence vector containing a multiple alignment with maximal score. + std::vector getMatch(const vector& vertical, const vector& horizontal); + + /// Returns alignment score. + double getScore(); + + private: + + // ATTRIBUTES + + MsaScoringScheme* ss; + MsaGapFunction* gf; + + vector< vector > F; ///< Gotoh diagonal score matrix. + vector< vector > F1; ///< Gotoh left score matrix. + vector< vector > F2; ///< Gotoh up score matrix. + vector< vector > D; ///< Gotoh diag traceback matrix. + vector< vector > D1; ///< Gotoh left traceback matrix. + vector< vector > D2; ///< Gotoh up traceback matrix. + + unsigned int n; ///< Length of vertical sequences. + unsigned int m; ///< Length of horizontal sequences. + + void calculateMatrix(const vector& vertical, const vector& horizontal); + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/MsaScoringScheme.h b/Phylogenesis/Sources/MsaScoringScheme.h new file mode 100644 index 0000000..3ea5d20 --- /dev/null +++ b/Phylogenesis/Sources/MsaScoringScheme.h @@ -0,0 +1,42 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __MsaScoringScheme_h__ +#define __MsaScoringScheme_h__ + +namespace Victor { namespace Phylogenesis { + + /** @brief Base class for Multiple Sequence Alignment (MSA) scoring functions. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + * + **/ + class MsaScoringScheme{ + public: + + /// Default constructor. + MsaScoringScheme(){} + + /// Destructor. + virtual ~MsaScoringScheme(){} + + /// Scoring function. + virtual double scoring(int i, int j) = 0; + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/MultipleAlignment.cc b/Phylogenesis/Sources/MultipleAlignment.cc new file mode 100644 index 0000000..1fd6e5c --- /dev/null +++ b/Phylogenesis/Sources/MultipleAlignment.cc @@ -0,0 +1,69 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include "MultipleAlignment.h" + +namespace Victor { +namespace Phylogenesis { + + MultipleAlignment::MultipleAlignment(vector sequences) : sequences(sequences) { + maxNameLength = 0; + for (unsigned int i = 0; i < sequences.size(); ++i) { + string tName = sequences[i]->name; + tName = tName.substr(0,tName.find(' ')); + if(tName.length() > maxNameLength) + maxNameLength = tName.length(); + } + } + + MultipleAlignment::~MultipleAlignment() { + } + + void MultipleAlignment::saveFasta(string t, string tName, ostream &output) { + output << ">" << tName << "\n"; + for (unsigned int i = 0; i < t.length(); i++) { + if ((i != 0) && ((i % 60) == 0)) + output << "\n"; + output << t[i]; + } + output << "\n"; + } + + void MultipleAlignment::saveClustal(string t, string tName, ostream &output, unsigned int from, unsigned int maxNameLength) { + tName = tName.substr(0,tName.find(' ')); + output << tName; + for (int i = 0; i < (static_cast(maxNameLength) + 10 - static_cast (tName.length())); ++i) + output << " "; + unsigned int max = ((from + 60) < t.length()) ? from + 60 : t.length(); + for (unsigned int i = from; i < max; i++) + output << t[i]; + output << "\n"; + } + + void MultipleAlignment::saveFasta(ostream &output) const { + for (unsigned int j = 0; j < sequences.size(); j++) + saveFasta(sequences[j]->sequence, sequences[j]->name, output); + } + + void MultipleAlignment::saveClustal(ostream &output) const { + for (unsigned int from = 0; from < sequences[0]->sequence.length(); from += 60) { + for (unsigned int j = 0; j < sequences.size(); j++) + saveClustal(sequences[j]->sequence, sequences[j]->name, output, from, maxNameLength); + output << "\n"; + } + } + +}} diff --git a/Phylogenesis/Sources/MultipleAlignment.h b/Phylogenesis/Sources/MultipleAlignment.h new file mode 100644 index 0000000..92498b1 --- /dev/null +++ b/Phylogenesis/Sources/MultipleAlignment.h @@ -0,0 +1,68 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef PHYLOGENESIS_SOURCES_MULTIPLEALIGNMENT_H_ +#define PHYLOGENESIS_SOURCES_MULTIPLEALIGNMENT_H_ + +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace Victor::Align2; + +namespace Victor { +namespace Phylogenesis { + + /** @brief Multiple Sequence Alignment result + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class MultipleAlignment { + public: + + /// Default constructor. + MultipleAlignment(vector sequences); + + /// Deconstructor. + virtual ~MultipleAlignment(); + + /// Save single sequence in FASTA format. + static void saveFasta(string t, string tName, ostream &output); + + /// Save as FASTA like output. + virtual void saveFasta(ostream &output) const; + + /// Save single line in CLUSTAL format. + static void saveClustal(string t, string tName, ostream &output, unsigned int from, unsigned int maxNameLength); + + /// Save as CLUSTAL like output. + virtual void saveClustal(ostream &output) const; + + /// Sequences + vector sequences; + + private: + + unsigned int maxNameLength; + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/NeighborJoining.cc b/Phylogenesis/Sources/NeighborJoining.cc new file mode 100644 index 0000000..4f1127a --- /dev/null +++ b/Phylogenesis/Sources/NeighborJoining.cc @@ -0,0 +1,188 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include +#include + +namespace Victor { namespace Phylogenesis{ + + /** + * + * @param species initial sequences + * @param distanceSpecies distance between sequences + */ + NeighborJoining::NeighborJoining(std::vector species, DistanceMatrix* distanceSpecies) : distanceSpecies(distanceSpecies) + { + for (unsigned int i = 0; i < species.size(); i++) + { + Cluster* cluster = new SingleCluster(species[i]); + (cluster->indices).push_back(species[i]->id); + clusters.push_back(cluster); + } + } + + double NeighborJoining::meanDistance(Cluster* c) + { + double mean = 0.0; + for (unsigned int i = 0; i < clusters.size(); i++) + { + Cluster* current = clusters[i]; + if(current != c) + { + mean+= distanceCluster.at(current,c); + } + } + mean = mean / double((clusters.size() - 2)); + + return mean; + } + + void NeighborJoining::updateMinimalClusterDistance() + { + meanDistances.clear(); + + for (unsigned int i = 0; i < clusters.size(); i++) + { + meanDistances.push_back(meanDistance(clusters[i])); + } + + minimalDistanceCluster.clear(); + + for (unsigned int i = 0; i < clusters.size() - 1; i++) + { + for (unsigned int j = i+1; j < clusters.size(); j++) + { + // M(i,j); + double distance_ij = distanceCluster.at(clusters[i],clusters[j]); + double r_i = meanDistances[i]; + double r_j = meanDistances[j]; + double distance = distance_ij - r_i - r_j; + + minimalDistanceCluster.insert(clusters[i],clusters[j], distance); + } + } + } + + void NeighborJoining::updateDistanceCluster(Cluster* newCluster) + { + Cluster* a = newCluster->branches[0]; + Cluster* b = newCluster->branches[1]; + + //Note: a and b was removed from clusters list + for (unsigned int i = 0; i < clusters.size(); i++) + { + Cluster* c = clusters[i]; + double distance = (distanceCluster.at(a,c) + distanceCluster.at(b,c) - distanceCluster.at(a,b)) / 2.0; + distanceCluster.insert(newCluster, c, distance); + } + } + + Cluster* NeighborJoining::performClustering() + { + std::cout<<"\nPerforming NJ clustering...\n"; + + //Init distanceCluster + for (unsigned int i = 0; i < clusters.size() - 1; i++) + { + for (unsigned int j = i+1; j < clusters.size(); j++) + { + distanceCluster.insert(clusters[i],clusters[j], distanceSpecies->at(i,j)); + } + } + + if(clusters.size() == 1){ + std::cout<<"done\n"; + return clusters[0]; + } + + while(clusters.size() > 2) + { + updateMinimalClusterDistance(); + + // Find the two clusters to 'smaller' distance according to NJ (minimalDistanceCluster) + double minDistance = std::numeric_limits::max(); + Cluster* firstToMerge = NULL; + Cluster* secondToMerge = NULL; + int firstIndex = -1; + int secondIndex = -2; + + for (unsigned int i = 0; i < clusters.size() - 1; i++) + { + for (unsigned int j = i+1; j < clusters.size(); j++) + { + Cluster* first = clusters[i]; + Cluster* second = clusters[j]; + // NJ distance + double pairDistance = minimalDistanceCluster.at(first, second); + + if(pairDistance < minDistance){ + minDistance = pairDistance; + firstToMerge = first; + secondToMerge = second; + firstIndex = i; + secondIndex = j; + } + } + } + + // Build the new cluster + Cluster* merged = new Cluster(); + merged->branches.push_back(firstToMerge); + merged->branches.push_back(secondToMerge); + + // Update species in the current cluster + std::vector ms = std::vector(firstToMerge->indices); + ms.insert(ms.end(),secondToMerge->indices.begin(),secondToMerge->indices.end()); + merged->indices = ms; + + //branchLength a,merged = d(A,B) / 2 + [r(A)-r(B)] / 2 + //branchLength b,merged = d(A,B) / 2 + [r(B)-r(A)] / 2 + double lenghtToFirst = (distanceCluster.at(firstToMerge,secondToMerge) / 2.0) + ((meanDistances[firstIndex] - meanDistances[secondIndex])/2.0); + double lenghtToSecond = distanceCluster.at(firstToMerge,secondToMerge) - lenghtToFirst; + merged->lengths.push_back(lenghtToFirst); + merged->lengths.push_back(lenghtToSecond); + + // Remove merged clusters from the initial list + std::vector clustersUpdated; + for (unsigned int i = 0; i < clusters.size(); i++) + { + if(clusters[i] == firstToMerge || clusters[i] == secondToMerge) + continue; + + clustersUpdated.push_back(clusters[i]); + } + clusters = clustersUpdated; + + // Update distanceCluster + updateDistanceCluster(merged); + + // Remove merged data distances + distanceCluster.erase(firstToMerge, secondToMerge); + + // Add the new cluster + clusters.push_back(merged); + } + + // Merge last 2 clusters with an edge + clusters[1]->branches.push_back(clusters[0]); + clusters[1]->lengths.push_back(distanceCluster.at(clusters[0],clusters[1])); + + return new Cluster(*clusters[1]); + } + +}} diff --git a/Phylogenesis/Sources/NeighborJoining.h b/Phylogenesis/Sources/NeighborJoining.h new file mode 100644 index 0000000..3fda8b5 --- /dev/null +++ b/Phylogenesis/Sources/NeighborJoining.h @@ -0,0 +1,66 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __NeighborJoining_H__ +#define __NeighborJoining_H__ + +#include +#include +#include +#include +#include "Sequence.h" + +namespace Victor { namespace Phylogenesis{ + + /** @brief Hierarchical clustering by NeighborJoining. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class NeighborJoining : public HierarchicalClusterBuilder + { + public: + /// Constructor. + NeighborJoining(std::vector species, DistanceMatrix* distanceSpecies); + + /// Perform the hierarchical clustering with NJ algorithm. + Cluster* performClustering(); + + private: + // ATTRIBUTES: + + /// Initial DistanceMatrix for given sequences + DistanceMatrix* distanceSpecies; + + /// NJ MinimalDistanceCluster + ClusterDistanceMatrix minimalDistanceCluster; + + /// ClusterDistanceMatrix + ClusterDistanceMatrix distanceCluster; + + /// Clusters still not clustered + std::vector clusters; + + /// Mean distance between other clusters + std::vector meanDistances; + + void updateMinimalClusterDistance(); + void updateDistanceCluster(Cluster* newCluster); + double meanDistance(Cluster* c); + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/NewickPhylogeneticTreeExporter.cc b/Phylogenesis/Sources/NewickPhylogeneticTreeExporter.cc new file mode 100644 index 0000000..e247824 --- /dev/null +++ b/Phylogenesis/Sources/NewickPhylogeneticTreeExporter.cc @@ -0,0 +1,95 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include +#include +#include +#include + +namespace Victor { namespace Phylogenesis{ + + //g++ bug? http://stackoverflow.com/questions/19122574/to-string-isnt-a-member-of-std + template + string to_string(T val) + { + stringstream stream; + stream << val; + return stream.str(); + } + + NewickPhylogeneticTreeExporter::NewickPhylogeneticTreeExporter(){ + } + + NewickPhylogeneticTreeExporter::NewickPhylogeneticTreeExporter(std::string fileName) : fileName(fileName) { + } + + std::string NewickPhylogeneticTreeExporter::buildRec(Cluster* cluster, double distanceFromFather) { + + std::stringstream result; + + // leaf node A:dist + if(cluster->isSingle()) + { + string name = static_cast(cluster)->sequence->name; + name = name.substr(0,name.find(' ')); + return name + ":" + to_string(distanceFromFather); + } + + // internal node (..,..,..):distanceFromFather - current + result << "("; + for(unsigned int i = 0; i < cluster->branches.size(); i++) + { + if(i>0) + result << ","; + result << buildRec(cluster->branches[i], cluster->lengths[i]); + } + result << ")"; + + // root node + if(distanceFromFather == 0) + { + result<<";"; + return result.str(); + } + + result<<":"<< to_string(distanceFromFather); + return result.str(); + } + + void NewickPhylogeneticTreeExporter::save(Cluster* tree){ + std::string result = buildRec(tree, 0.0); + + if(!fileName.empty()) + { + std::cout<<"\nWriting newick file "<. +*/ + +#ifndef __NewickTreeBuilder_H__ +#define __NewickTreeBuilder_H__ + +#include +#include +#include + +namespace Victor { namespace Phylogenesis{ + + /** @brief Clustering tree exporter in the Newick tree format. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class NewickPhylogeneticTreeExporter : public PhylogeneticTreeExporter + { + public: + /// Constructor. + NewickPhylogeneticTreeExporter(); + + /// Default constructor. + NewickPhylogeneticTreeExporter(std::string fileName); + + /// Export the clustering tree in the Newick tree format + void save(Cluster* tree); + + /// Export the clustering tree in the Newick tree format to output + void save(Cluster* tree, ostream &output); + + private: + + /// Recursive call to build the clustering tree + std::string buildRec(Cluster* cluster, double fatherDistance); + + /// Output filename + std::string fileName; + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/PhylogeneticTreeExporter.h b/Phylogenesis/Sources/PhylogeneticTreeExporter.h new file mode 100644 index 0000000..0324131 --- /dev/null +++ b/Phylogenesis/Sources/PhylogeneticTreeExporter.h @@ -0,0 +1,40 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __PhylogeneticTreeExporter_H__ +#define __PhylogeneticTreeExporter_H__ + +#include + +namespace Victor { namespace Phylogenesis{ + + /** @brief Abstract class for clustering tree exporter. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class PhylogeneticTreeExporter + { + public: + /// Destructor. + virtual ~PhylogeneticTreeExporter(){} + + /// Export the clustering tree + virtual void save(Cluster* tree) = 0; + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/Sequence.cc b/Phylogenesis/Sources/Sequence.cc new file mode 100644 index 0000000..8524f52 --- /dev/null +++ b/Phylogenesis/Sources/Sequence.cc @@ -0,0 +1,119 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include + +namespace Victor { namespace Phylogenesis{ + +static unsigned int HYDROPHILIC_STRETCH_SIZE = 5; + +Sequence::Sequence(){ + id = -1; + divergent = false; +} + +/** +* +* @param name of the RawSequence +* @param seq of the RawSequence +* @param id of the RawSequence +*/ +Sequence::Sequence(std::string name, std::string seq, int id): name(name), sequence(seq), id(id), divergent(false){ +} + +bool Sequence::isGap(int pos){ + if(pos < 0 || pos >= (int)sequence.size()) + return false; + return sequence[pos] == '-'; +} + +bool Sequence::isHydrophilic(int pos){ + if( sequence[pos] == 'D' || + sequence[pos] == 'E' || + sequence[pos] == 'G' || + sequence[pos] == 'K' || + sequence[pos] == 'N' || + sequence[pos] == 'Q' || + sequence[pos] == 'P' || + sequence[pos] == 'R' || + sequence[pos] == 'S'){ + return true; + } + return false; +} + +bool Sequence::isHydrophilicStretch(int pos){ + if(pos < 0 || pos >= (int)sequence.size()) + return false; + + bool goOn = true; + for(unsigned int i = pos, j = 0; i Sequence::LoadFasta(std::ifstream& inputFile){ + std::vector seqs; + int id=0; + std::string line, name, content; + while(std::getline( inputFile, line ).good()){ + if( line.empty() || line[0] == '>' ){ + if( !name.empty() ){ + seqs.push_back(new Sequence(name, GetPureSequence(content), id++)); + name.clear(); + } + if( !line.empty() ){ + name = line.substr(1); + } + content.clear(); + } else if( !name.empty() ){ + if( line.find(' ') != std::string::npos ){ + name.clear(); + content.clear(); + } else { + content += line; + } + } + } + if( !name.empty() ){ + seqs.push_back(new Sequence(name, content, id++)); + } + return seqs; +} + +double Sequence::GetAvgLength(std::vector seq){ + double length = 0; + for(unsigned int i=0;isequence).size(); + } + return length / seq.size(); +} + +}} diff --git a/Phylogenesis/Sources/Sequence.h b/Phylogenesis/Sources/Sequence.h new file mode 100644 index 0000000..f6dbae5 --- /dev/null +++ b/Phylogenesis/Sources/Sequence.h @@ -0,0 +1,76 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . + */ + +#ifndef __Sequence_H__ +#define __Sequence_H__ + +#include +#include +#include + +namespace Victor { namespace Phylogenesis{ + + /** @brief It represents a generic sequence with name. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class Sequence{ + public: + /// Default constructor. + Sequence(); + + /// Constructor. + Sequence(std::string name, std::string sequence, int id); + + /// Check if there is a gap at position + bool isGap(int pos); + + /// Check if amino acid at pos is hydrophilic + bool isHydrophilic(int pos); + + /// Check if there is an hydrophilic stretch at position + bool isHydrophilicStretch(int pos); + + /// Read FASTA format input. + static std::vector LoadFasta(std::ifstream& inputFile); + + /// Get sequence without gaps + static std::string GetPureSequence(const std::string &s); + + /// Get average length of the sequence + static double GetAvgLength(std::vector seq); + + // ATTRIBUTES: + + /// Sequence's name + std::string name; + /// Generic sequence + std::string sequence; + /// Id + int id; + /// Sequence weight + double weight; + /// Normalized sequence weight + double normalizedWeight; + /// Is divergent + bool divergent; + + + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/SingleCluster.h b/Phylogenesis/Sources/SingleCluster.h new file mode 100644 index 0000000..62b2116 --- /dev/null +++ b/Phylogenesis/Sources/SingleCluster.h @@ -0,0 +1,54 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __SingleCluster_H__ +#define __SingleCluster_H__ + +#include +#include +#include "Sequence.h" + +namespace Victor { namespace Phylogenesis{ + + /** @brief It represents a leaf of the clustering tree containing a Sequence object. + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class SingleCluster : public Cluster{ + public: + /// Constructor. + SingleCluster(Sequence* sequence); + + /// Returns if is a single cluster (true) + virtual bool isSingle(); + + // ATTRIBUTES: + Sequence* sequence; + }; + + inline SingleCluster::SingleCluster(Sequence* sequence) : sequence(sequence) + { + distance = 0.0; + } + + inline bool SingleCluster::isSingle() + { + return true; + } + +}} + +#endif diff --git a/Phylogenesis/Sources/SubMatrixFactory.cc b/Phylogenesis/Sources/SubMatrixFactory.cc new file mode 100644 index 0000000..9f175d5 --- /dev/null +++ b/Phylogenesis/Sources/SubMatrixFactory.cc @@ -0,0 +1,137 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include + +namespace Victor { +namespace Phylogenesis { + +SubMatrixFactory::SubMatrixFactory(bool positive) { + + string path = getenv("VICTOR_ROOT"); + path += + "data/"; + string matrixFileName; + + matrixFileName = path + "blosum30.dat"; + ifstream b30(matrixFileName.c_str()); + blosum30 = new SubMatrix(b30); + + matrixFileName = path + "blosum45.dat"; + ifstream b45(matrixFileName.c_str()); + blosum45 = new SubMatrix(b45); + + matrixFileName = path + "blosum62.dat"; + ifstream b62(matrixFileName.c_str()); + blosum62 = new SubMatrix(b62); + + matrixFileName = path + "blosum80.dat"; + ifstream b80(matrixFileName.c_str()); + blosum80 = new SubMatrix(b80); + + matrixFileName = path + "pam20.dat"; + ifstream p20(matrixFileName.c_str()); + pam20 = new SubMatrix(p20); + + matrixFileName = path + "pam60.dat"; + ifstream p60(matrixFileName.c_str()); + pam60 = new SubMatrix(p60); + + matrixFileName = path + "pam120.dat"; + ifstream p120(matrixFileName.c_str()); + pam120 = new SubMatrix(p120); + + matrixFileName = path + "pam350.dat"; + ifstream p350(matrixFileName.c_str()); + pam350 = new SubMatrix(p350); + + if(positive){ + rescore(blosum30); + rescore(blosum45); + rescore(blosum62, 2.0); + rescore(blosum80); + rescore(pam20); + rescore(pam60); + rescore(pam120); + rescore(pam350); + } +} + +void SubMatrixFactory::rescore(SubMatrix* matrix, double scalar){ + string residues = matrix->getResidues(); + double res = residues.size()-2; //avoids '-' + for(int i=0;i<=res;i++){ + char res1 = residues[i]; + char res1_bis = res1; + if(res1>=65 && res1<=90){ + res1_bis = res1 + 32; + } + if(res1>=97 && res1<=122){ + res1_bis = res1 - 32; + } + for(int j=0;j<=i;j++){ + char res2 = residues[j]; + char res2_bis = res2; + if(res2>=65 && res2<=90){ + res2_bis = res2 + 32; + } + if(res2>=97 && res2<=122){ + res2_bis = res2 - 32; + } + double rescoredValue = matrix->score[res1][res2] - matrix->getMinScore(); + matrix->score[res1][res2] = matrix->score[res2][res1] = + matrix->score[res1][res2_bis] = matrix->score[res2_bis][res1] = + matrix->score[res1_bis][res2] = matrix->score[res2][res1_bis] = + matrix->score[res1_bis][res2_bis] = matrix->score[res2_bis][res1_bis] = rescoredValue * scalar; + } + } +} + +SubMatrixFactory::~SubMatrixFactory() { + delete blosum30; + delete blosum45; + delete blosum62; + delete blosum80; + delete pam20; + delete pam60; + delete pam120; + delete pam350; +} + +SubMatrix* SubMatrixFactory::getSubMatrix(SubMatrixName name){ + switch(name){ + case(BLOSUM30): + return blosum30; + case(BLOSUM45): + return blosum45; + case(BLOSUM62): + return blosum62; + case(BLOSUM80): + return blosum80; + case(PAM20): + return pam20; + case(PAM60): + return pam60; + case(PAM120): + return pam120; + case(PAM350): + return pam350; + } + return NULL; +} + +}} diff --git a/Phylogenesis/Sources/SubMatrixFactory.h b/Phylogenesis/Sources/SubMatrixFactory.h new file mode 100644 index 0000000..4ccca3c --- /dev/null +++ b/Phylogenesis/Sources/SubMatrixFactory.h @@ -0,0 +1,73 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __SubMatrixFactory_h__ +#define __SubMatrixFactory_h__ + +#include + +namespace Victor { namespace Phylogenesis { + + using namespace Victor::Align2; + + enum SubMatrixFamily{ + BLOSUM, + PAM + }; + + enum SubMatrixName { + BLOSUM30, + BLOSUM45, + BLOSUM62, + BLOSUM80, + PAM20, + PAM60, + PAM120, + PAM350 + }; + + /** @brief Implement a factory for SumMatrix objects + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class SubMatrixFactory { + private: + SubMatrix* blosum30; + SubMatrix* blosum45; + SubMatrix* blosum62; + SubMatrix* blosum80; + SubMatrix* pam20; + SubMatrix* pam60; + SubMatrix* pam120; + SubMatrix* pam350; + public: + + /// Default constructor. + SubMatrixFactory(bool positive); + + /// Deconstructor. + virtual ~SubMatrixFactory(); + + /// Returns the SubMatrix by name. + SubMatrix* getSubMatrix(SubMatrixName name); + + /// Rescore the SubMatrix with positive values and multiplied by the scalar. + void rescore(SubMatrix* matrix, double scalar = 1.0); + }; + +}} + +#endif diff --git a/Phylogenesis/Sources/UPGMA.cc b/Phylogenesis/Sources/UPGMA.cc new file mode 100644 index 0000000..26896dd --- /dev/null +++ b/Phylogenesis/Sources/UPGMA.cc @@ -0,0 +1,146 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include +#include + +namespace Victor { namespace Phylogenesis{ + + /** + * + * @param species initial sequences + * @param distanceSpecies distance between sequences + */ + UPGMA::UPGMA(std::vector species, DistanceMatrix* distanceSpecies) : distanceSpecies(distanceSpecies) + { + for (unsigned int i = 0; i < species.size(); i++) + { + Cluster* cluster = new SingleCluster(species[i]); + (cluster->indices).push_back(species[i]->id); + clusters.push_back(cluster); + } + } + + Cluster* UPGMA::performClustering() + { + //Init distanceCluster + for (unsigned int i = 0; i < clusters.size() - 1; i++) + { + for (unsigned int j = i+1; j < clusters.size(); j++) + { + distanceCluster.insert(clusters[i],clusters[j], distanceSpecies->at(i,j)); + } + } + + std::cout<<"\nPerforming UPGMA clustering\n"; + + while(clusters.size() != 1) + { + // Find the two clusters at minimum distance + double minDistance = std::numeric_limits::max(); + Cluster* firstToMerge = NULL; + Cluster* secondToMerge = NULL; + + for (unsigned int i = 0; i < clusters.size() - 1; i++) + { + for (unsigned int j = i+1; j < clusters.size(); j++) + { + Cluster* first = clusters[i]; + Cluster* second = clusters[j]; + double pairDistance = distanceCluster.at(first, second); + + if(pairDistance < minDistance){ + minDistance = pairDistance; + firstToMerge = first; + secondToMerge = second; + } + } + } + + // Build the new cluster + Cluster* merged = new Cluster(); + merged->branches.push_back(firstToMerge); + merged->branches.push_back(secondToMerge); + + // Current cluster distance + double distance = minDistance / 2.0; + merged->distance = distance; + + // Distance to clusters + merged->lengths.push_back(distance - firstToMerge->distance); + merged->lengths.push_back(distance - secondToMerge->distance); + + // Update species in the current cluster + std::vector ms = std::vector(firstToMerge->indices); + ms.insert(ms.end(),secondToMerge->indices.begin(),secondToMerge->indices.end()); + merged->indices = ms; + + // Remove merged data distances + distanceCluster.erase(firstToMerge, secondToMerge); + + // Remove merged clusters from the initial list + std::vector clustersUpdated; + for (unsigned int i = 0; i < clusters.size(); i++) + { + if(clusters[i] == firstToMerge || clusters[i] == secondToMerge) + continue; + + clustersUpdated.push_back(clusters[i]); + } + clusters = clustersUpdated; + + // Add new distance from other clusters + updateClusterDistanceMatrix(merged); + + // Add the new cluster + //clusters.push_back(merged); + //To keep original ClustalW order + clusters.insert(clusters.begin(), merged); + } + + Cluster* root = clusters.front(); + root->setRooted(true); + return root; + } + + void UPGMA::updateClusterDistanceMatrix(Cluster* newCluster) + { + for (unsigned int i = 0; i < clusters.size(); i++) + { + Cluster* c = clusters[i]; + + // Calculates the distance based on the present species + double distance = 0.0; + std::vector leftS(newCluster->indices); + std::vector rightS(c->indices); + + for (unsigned int ii = 0; ii < leftS.size(); ii++) + { + for (unsigned int jj = 0; jj < rightS.size(); jj++) + { + distance += distanceSpecies->at(leftS[ii],rightS[jj]); + } + } + distance = distance / (leftS.size() * rightS.size()); + + // Add it + distanceCluster.insert(c, newCluster, distance); + } + } + +}} diff --git a/Phylogenesis/Sources/UPGMA.h b/Phylogenesis/Sources/UPGMA.h new file mode 100644 index 0000000..c77ae5d --- /dev/null +++ b/Phylogenesis/Sources/UPGMA.h @@ -0,0 +1,60 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#ifndef __UPGMA_H_ +#define __UPGMA_H_ + +#include +#include +#include +#include +#include +#include "Sequence.h" + +namespace Victor { namespace Phylogenesis{ + + /** @brief Hierarchical clustering by UPGMA (Unweighted Pair Group Method with Arithmetic Mean). + * @author Andrea Nasari (andrea.nasari\@studenti.unipd.it) + * + **/ + class UPGMA : public HierarchicalClusterBuilder{ + public: + /// Constructor. + UPGMA(std::vector species, DistanceMatrix* distanceSpecies); + + /// Perform the hierarchical clustering with UPGMA algorithm. + Cluster* performClustering(); + + private: + + /// Update the ClusterDistanceMatrix with the new Cluster + void updateClusterDistanceMatrix(Cluster* newCluster); + + // ATTRIBUTES: + + /// Initial DistanceMatrix for given sequences + DistanceMatrix* distanceSpecies; + + /// Clusters still not clustered + std::vector clusters; + + /// ClusterDistanceMatrix + ClusterDistanceMatrix distanceCluster; + }; + +}} + +#endif diff --git a/Phylogenesis/Tests/Makefile b/Phylogenesis/Tests/Makefile new file mode 100644 index 0000000..7ee4a51 --- /dev/null +++ b/Phylogenesis/Tests/Makefile @@ -0,0 +1,57 @@ +#--*- makefile -*-------------------------------------------------------------- +# +# Standard makefile +# +#------------------------------------------------------------------------------ + +# Path to project directory. +UPDIR = ../.. +# Path to subdirectories. +SUBDIR= +# Path to directory for binaries: +BINPATH = ../../bin + + +# +# Libraries and paths (which are not defined globally). +# + +LIBS = -lPhylogenesis -lAlign2 -lBiopool -ltools -L/usr/lib/ -lm -ldl -lcppunit + +LIB_PATH = -L. + +INC_PATH = -I. -I ../../Biopool/ -I../../tools/ -I../../Align2/Sources -I../../Phylogenesis/Sources + +# +# Objects and headers +# + +SOURCES = TestPhylogenesis.cc TestDistanceMatrix.h \ + TestClusterDistanceMatrix.h TestUPGMA.h TestNJ.h \ + TestNewickPhylogeneticTreeExporter.h TestClustalw.h + +OBJECTS = $(SOURCES:.cpp=.o) + +TARGETS = TestPhylogenesis + +EXECS = TestPhylogenesis + +LIBRARY = TESTlibPhylogenesis.a + +# +# Install rule +# + +compile: all + +all: install + +install: $(LIBRARY) $(TARGETS) + mv $(EXECS) $(UPDIR)/bin + mv $(LIBRARY) $(UPDIR)/lib + +# +# Call global Makefile to do the job. +# + +include ../../Makefile.global diff --git a/Phylogenesis/Tests/TestClustalw.h b/Phylogenesis/Tests/TestClustalw.h new file mode 100644 index 0000000..84503e0 --- /dev/null +++ b/Phylogenesis/Tests/TestClustalw.h @@ -0,0 +1,86 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +using namespace std; +using namespace Victor; +using namespace Victor::Phylogenesis; + +class TestClustalw : public CppUnit::TestFixture { +private: + DistanceMatrix* dm; + ClustalW* sut; +public: + + TestClustalw(){ + string path = getenv("VICTOR_ROOT"); + string distanceMatrix = path + "Phylogenesis/Tests/data/distanceMatrix.dat"; + dm = new DistanceMatrix(distanceMatrix); + } + + virtual ~TestClustalw(){ + } + + void setUp(){ + } + + void tearDown() { + } + + void test_msa(){ + + Sequence* s1 = new Sequence("A","PEEKSAVTALWGKVNVDEYGG", 0); + Sequence* s2 = new Sequence("B","GEEKAAVLALWDKVNEEEYGG", 1); + Sequence* s3 = new Sequence("C","PADKTNVKAAWGKVGAHAGEYGA", 2); + vector sequences; + sequences.push_back(s1); + sequences.push_back(s2); + sequences.push_back(s3); + + UPGMA* builder = new UPGMA(sequences, dm); + ClustalW* sut = new ClustalW(dm, builder, sequences, 0.4, 10, 0.2, false, false); + + MultipleAlignment* result = sut->performMultipleSequenceAlignment(); + + CPPUNIT_ASSERT(result->sequences[0]->sequence == "PEEKSAVTALWGKV--NVDEYGG"); + CPPUNIT_ASSERT(result->sequences[1]->sequence == "GEEKAAVLALWDKV--NEEEYGG"); + CPPUNIT_ASSERT(result->sequences[2]->sequence == "PADKTNVKAAWGKVGAHAGEYGA"); + } + + static CppUnit::Test *suite() { + + CppUnit::TestSuite *suiteOfTests = new CppUnit::TestSuite("TestClustalw"); + + suiteOfTests->addTest(new CppUnit::TestCaller("Test1 - MSA.", + &TestClustalw::test_msa)); + + return suiteOfTests; + } + +}; diff --git a/Phylogenesis/Tests/TestClusterDistanceMatrix.h b/Phylogenesis/Tests/TestClusterDistanceMatrix.h new file mode 100644 index 0000000..242c8cc --- /dev/null +++ b/Phylogenesis/Tests/TestClusterDistanceMatrix.h @@ -0,0 +1,94 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +//To avoid "Debug.h" definition +#undef exception +#include + +using namespace std; +using namespace Victor; +using namespace Victor::Phylogenesis; + +class TestClusterDistanceMatrix : public CppUnit::TestFixture { +private: + ClusterDistanceMatrix* sut; +public: + + virtual ~TestClusterDistanceMatrix() { + } + + void setUp() { + } + + void tearDown() { + } + + void test_shouldInsertAndRetrieve() { + Cluster* c1 = new Cluster(); + Cluster* c2 = new Cluster(); + + sut = new ClusterDistanceMatrix(); + sut->insert(c1, c2, 0.10); + + CPPUNIT_ASSERT(sut->at(c1,c2) == sut->at(c2,c1)); + CPPUNIT_ASSERT(sut->at(c1,c2) == 0.10); + } + + void test_shouldEraseCorrectly() { + Cluster* c1 = new Cluster(); + Cluster* c2 = new Cluster(); + Cluster* c3 = new Cluster(); + Cluster* c4 = new Cluster(); + + sut = new ClusterDistanceMatrix(); + sut->insert(c1, c2, 0.10); + sut->insert(c1, c3, 0.20); + sut->insert(c2, c3, 0.30); + sut->insert(c2, c4, 0.40); + sut->insert(c3, c4, 0.50); + + sut->erase(c1, c4); + + CPPUNIT_ASSERT(sut->at(c2,c3) == 0.30); + CPPUNIT_ASSERT_THROW(sut->at(c1,c2), std::out_of_range); + CPPUNIT_ASSERT_THROW(sut->at(c1,c3), std::out_of_range); + CPPUNIT_ASSERT_THROW(sut->at(c2,c4), std::out_of_range); + CPPUNIT_ASSERT_THROW(sut->at(c3,c4), std::out_of_range); + } + + static CppUnit::Test *suite() { + + CppUnit::TestSuite *suiteOfTests = new CppUnit::TestSuite("TestClusterDistanceMatrix"); + + suiteOfTests->addTest(new CppUnit::TestCaller("Test1 - ShouldInsertAndRetrieve.", + &TestClusterDistanceMatrix::test_shouldInsertAndRetrieve)); + suiteOfTests->addTest(new CppUnit::TestCaller("Test2 - ShouldEraseCorrectly.", + &TestClusterDistanceMatrix::test_shouldEraseCorrectly)); + + return suiteOfTests; + } + + +}; diff --git a/Phylogenesis/Tests/TestDistanceMatrix.h b/Phylogenesis/Tests/TestDistanceMatrix.h new file mode 100644 index 0000000..9d4e415 --- /dev/null +++ b/Phylogenesis/Tests/TestDistanceMatrix.h @@ -0,0 +1,72 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace Victor; +using namespace Victor::Phylogenesis; + +class TestDistanceMatrix : public CppUnit::TestFixture { + + private: + string dataPath; + DistanceMatrix* sut; + + public: + + TestDistanceMatrix(){ + string path = getenv("VICTOR_ROOT"); + dataPath = path + "Phylogenesis/Tests/data/"; + sut = NULL; + } + + virtual ~TestDistanceMatrix() { + } + + void setUp() { + } + + void tearDown() { + } + + void test_shouldLoadDistanceMatrixFromFile() { + sut = new DistanceMatrix(dataPath + "distanceMatrix.dat"); + + CPPUNIT_ASSERT(sut->at(0,1) == sut->at(1,0)); + CPPUNIT_ASSERT(sut->at(0,1) == 0.10); + CPPUNIT_ASSERT(sut->at(0,2) == sut->at(2,0)); + CPPUNIT_ASSERT(sut->at(0,2) == 0.20); + CPPUNIT_ASSERT(sut->at(1,2) == sut->at(2,1)); + CPPUNIT_ASSERT(sut->at(1,2) == 0.30); + } + + static CppUnit::Test *suite() { + + CppUnit::TestSuite *suiteOfTests = new CppUnit::TestSuite("TestDistanceMatrix"); + + suiteOfTests->addTest(new CppUnit::TestCaller("Test1 - ShouldLoadDistanceMatrixFromFile.", + &TestDistanceMatrix::test_shouldLoadDistanceMatrixFromFile)); + + return suiteOfTests; + } +}; diff --git a/Phylogenesis/Tests/TestNJ.h b/Phylogenesis/Tests/TestNJ.h new file mode 100644 index 0000000..de48d06 --- /dev/null +++ b/Phylogenesis/Tests/TestNJ.h @@ -0,0 +1,124 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace Victor; +using namespace Victor::Phylogenesis; + +class TestNJ : public CppUnit::TestFixture { +private: + NeighborJoining* sut; + +public: + + virtual ~TestNJ() { + } + + void setUp() { + } + + void tearDown() { + } + + void test_ShouldClusteringOneSpecies() { + vector species; + species.push_back(new Sequence("s1","AAA",0)); + + sut = new NeighborJoining(species, NULL); + Cluster* result = sut->performClustering(); + + CPPUNIT_ASSERT(result->branches.size() == 0); + CPPUNIT_ASSERT(result->distance == 0); + CPPUNIT_ASSERT(result->isSingle() == true); + } + + void test_ShouldClusteringTwoSpecies() { + + DistanceMatrix* dm = new DistanceMatrix(); + dm->appendDistance(0,100); + + vector species; + species.push_back(new Sequence("s1","AAA",0)); + species.push_back(new Sequence("s2","CCC",1)); + + sut = new NeighborJoining(species, dm); + Cluster* result = sut->performClustering(); + + CPPUNIT_ASSERT(result->branches.size() == 1); + CPPUNIT_ASSERT(result->isSingle() == false); + CPPUNIT_ASSERT(result->branches[0]->isSingle() == true); + CPPUNIT_ASSERT(result->lengths[0] == 100); + } + + void test_ShouldClusteringThreeSpecies() { + + DistanceMatrix* dm = new DistanceMatrix(); + dm->appendDistance(0,100); + dm->appendDistance(0,200); + dm->appendDistance(1,300); + //0 1 100 + //0 2 200 + //1 2 300 + + vector species; + species.push_back(new Sequence("s1","AAA",0)); + species.push_back(new Sequence("s2","CCC",1)); + species.push_back(new Sequence("s3","DDD",2)); + + sut = new NeighborJoining(species, dm); + Cluster* result = sut->performClustering(); + + // merge s1 - s3, then s2 + + CPPUNIT_ASSERT(result->isSingle() == false); + CPPUNIT_ASSERT(result->branches.size() == 3); + CPPUNIT_ASSERT(result->lengths[0] == 0); + CPPUNIT_ASSERT(result->lengths[1] == 100); + CPPUNIT_ASSERT(result->lengths[2] == 200); + CPPUNIT_ASSERT(result->branches[0]->isSingle() == true); + CPPUNIT_ASSERT(result->branches[1]->isSingle() == true); + CPPUNIT_ASSERT(result->branches[2]->isSingle() == true); + } + + static CppUnit::Test *suite() { + + CppUnit::TestSuite *suiteOfTests = new CppUnit::TestSuite("TestNJ"); + + suiteOfTests->addTest(new CppUnit::TestCaller("Test1 - ShouldClusteringOneSpecies.", + &TestNJ::test_ShouldClusteringOneSpecies)); + + suiteOfTests->addTest(new CppUnit::TestCaller("Test2 - ShouldClusteringTwoSpecies.", + &TestNJ::test_ShouldClusteringTwoSpecies)); + + suiteOfTests->addTest(new CppUnit::TestCaller("Test3 - ShouldClusteringThreeSpecies.", + &TestNJ::test_ShouldClusteringThreeSpecies)); + + return suiteOfTests; + } + +}; diff --git a/Phylogenesis/Tests/TestNewickPhylogeneticTreeExporter.h b/Phylogenesis/Tests/TestNewickPhylogeneticTreeExporter.h new file mode 100644 index 0000000..d94003b --- /dev/null +++ b/Phylogenesis/Tests/TestNewickPhylogeneticTreeExporter.h @@ -0,0 +1,125 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace Victor; +using namespace Victor::Phylogenesis; + +class TestNewickPhylogeneticTreeExporter : public CppUnit::TestFixture { +private: + NewickPhylogeneticTreeExporter* sut; + +public: + + TestNewickPhylogeneticTreeExporter(){ + sut = new NewickPhylogeneticTreeExporter(); + } + + virtual ~TestNewickPhylogeneticTreeExporter() { + } + + void test_ShouldPrintNewickTree1(){ + + Sequence* s = new Sequence("A","xxx",0); + Cluster* tree = new SingleCluster(s); + + string expectation = "A:0"; + std::ostringstream stream; + sut->save(tree, stream); + + CPPUNIT_ASSERT(expectation == stream.str()); + } + + void test_ShouldPrintNewickTree2(){ + + Sequence* s1 = new Sequence("A","xxx",0); + Sequence* s2 = new Sequence("B","xxx",1); + Cluster* leaf1 = new SingleCluster(s1); + Cluster* leaf2 = new SingleCluster(s2); + + Cluster* tree = new Cluster(); + tree->branches.push_back(leaf1); + tree->branches.push_back(leaf2); + tree->lengths.push_back(10); + tree->lengths.push_back(20); + + string expectation = "(A:10,B:20);"; + std::ostringstream stream; + sut->save(tree, stream); + + CPPUNIT_ASSERT(expectation == stream.str()); + } + + void test_ShouldPrintNewickTree3(){ + Sequence* s1 = new Sequence("A","xxx",0); + Sequence* s2 = new Sequence("B","xxx",1); + Cluster* leaf1 = new SingleCluster(s1); + Cluster* leaf2 = new SingleCluster(s2); + Cluster* lleaf1 = new Cluster(); + lleaf1->branches.push_back(leaf1); + lleaf1->branches.push_back(leaf2); + lleaf1->lengths.push_back(10); + lleaf1->lengths.push_back(20); + Sequence* s3 = new Sequence("C","xxx",2); + Cluster* lleaf2 = new SingleCluster(s3); + + Cluster* tree = new Cluster(); + tree->branches.push_back(lleaf1); + tree->branches.push_back(lleaf2); + tree->lengths.push_back(30); + tree->lengths.push_back(40); + + string expectation = "((A:10,B:20):30,C:40);"; + std::ostringstream stream; + sut->save(tree, stream); + + CPPUNIT_ASSERT(expectation == stream.str()); + } + + static CppUnit::Test *suite() { + + CppUnit::TestSuite *suiteOfTests = new CppUnit::TestSuite("TestNewickPhylogeneticTreeExporter"); + + suiteOfTests->addTest(new CppUnit::TestCaller("Test1 - ShouldPrintNewickTree1.", + &TestNewickPhylogeneticTreeExporter::test_ShouldPrintNewickTree1)); + suiteOfTests->addTest(new CppUnit::TestCaller("Test2 - ShouldPrintNewickTree2.", + &TestNewickPhylogeneticTreeExporter::test_ShouldPrintNewickTree2)); + suiteOfTests->addTest(new CppUnit::TestCaller("Test3 - ShouldPrintNewickTree3.", + &TestNewickPhylogeneticTreeExporter::test_ShouldPrintNewickTree3)); + + return suiteOfTests; + } + + /// Setup method + + void setUp() { + } + + void tearDown() { + } + +}; diff --git a/Phylogenesis/Tests/TestPhylogenesis.cc b/Phylogenesis/Tests/TestPhylogenesis.cc new file mode 100644 index 0000000..9b3ac0b --- /dev/null +++ b/Phylogenesis/Tests/TestPhylogenesis.cc @@ -0,0 +1,45 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace Victor; + +int main() { + CppUnit::TextUi::TestRunner runner; + + cout << "Creating Test Suites:" << endl; + runner.addTest(TestDistanceMatrix::suite()); + runner.addTest(TestClusterDistanceMatrix::suite()); + runner.addTest(TestUPGMA::suite()); + runner.addTest(TestNJ::suite()); + runner.addTest(TestNewickPhylogeneticTreeExporter::suite()); + runner.addTest(TestClustalw::suite()); + + cout<< "Running the unit tests."<. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../Sources/Sequence.h" + +using namespace std; +using namespace Victor; +using namespace Victor::Phylogenesis; + +class TestUPGMA : public CppUnit::TestFixture { +private: + UPGMA* sut; + +public: + + virtual ~TestUPGMA() { + } + + void setUp() { + } + + void tearDown() { + } + + void test_ShouldClusteringOneSpecies() { + vector species; + species.push_back(new Sequence("s1","AAA", 0)); + + sut = new UPGMA(species, NULL); + Cluster* result = sut->performClustering(); + + CPPUNIT_ASSERT(result->branches.size() == 0); + CPPUNIT_ASSERT(result->distance == 0); + CPPUNIT_ASSERT(result->isSingle() == true); + } + + void test_ShouldClusteringTwoSpecies() { + + DistanceMatrix* dm = new DistanceMatrix(); + dm->appendDistance(0,100); + + vector species; + species.push_back(new Sequence("s1","AAA", 0)); + species.push_back(new Sequence("s2","CCC", 1)); + + sut = new UPGMA(species, dm); + Cluster* result = sut->performClustering(); + + CPPUNIT_ASSERT(result->branches.size() == 2); + CPPUNIT_ASSERT(result->distance == 50); + CPPUNIT_ASSERT(result->isSingle() == false); + CPPUNIT_ASSERT(result->branches[0]->isSingle() == true); + CPPUNIT_ASSERT(result->branches[1]->isSingle() == true); + CPPUNIT_ASSERT(result->lengths[0] == 50); + CPPUNIT_ASSERT(result->lengths[1] == 50); + } + + void test_ShouldClusteringThreeSpecies() { + + DistanceMatrix* dm = new DistanceMatrix(); + dm->appendDistance(0,100); + dm->appendDistance(0,200); + dm->appendDistance(1,300); + //0 1 100 + //0 2 200 + //1 2 300 + + vector species; + species.push_back(new Sequence("s1","AAA", 0)); + species.push_back(new Sequence("s2","CCC", 1)); + species.push_back(new Sequence("s3","DDD", 2)); + + sut = new UPGMA(species, dm); + Cluster* result = sut->performClustering(); + + CPPUNIT_ASSERT(result->branches.size() == 2); + CPPUNIT_ASSERT(result->distance == 125); + CPPUNIT_ASSERT(result->lengths[1] == 125); + CPPUNIT_ASSERT(result->lengths[0] == 75); + + Cluster* left = result->branches[1]; + Cluster* right = result->branches[0]; + + CPPUNIT_ASSERT(left->isSingle()); + CPPUNIT_ASSERT(right->branches.size() == 2); + CPPUNIT_ASSERT(right->distance == 50); + CPPUNIT_ASSERT(right->isSingle() == false); + CPPUNIT_ASSERT(right->branches[0]->isSingle() == true); + CPPUNIT_ASSERT(right->branches[1]->isSingle() == true); + CPPUNIT_ASSERT(right->lengths[0] == 50); + CPPUNIT_ASSERT(right->lengths[1] == 50); + } + + static CppUnit::Test *suite() { + + CppUnit::TestSuite *suiteOfTests = new CppUnit::TestSuite("TestUPGMA"); + + suiteOfTests->addTest(new CppUnit::TestCaller("Test1 - ShouldClusteringOneSpecies.", + &TestUPGMA::test_ShouldClusteringOneSpecies)); + + suiteOfTests->addTest(new CppUnit::TestCaller("Test2 - ShouldClusteringTwoSpecies.", + &TestUPGMA::test_ShouldClusteringTwoSpecies)); + + suiteOfTests->addTest(new CppUnit::TestCaller("Test3 - ShouldClusteringThreeSpecies.", + &TestUPGMA::test_ShouldClusteringThreeSpecies)); + + return suiteOfTests; + } + +}; diff --git a/Phylogenesis/Tests/data/distanceMatrix.dat b/Phylogenesis/Tests/data/distanceMatrix.dat new file mode 100644 index 0000000..91080dc --- /dev/null +++ b/Phylogenesis/Tests/data/distanceMatrix.dat @@ -0,0 +1,3 @@ +0 1 0.10 +0 2 0.20 +1 2 0.30 \ No newline at end of file diff --git a/data/blosum30.dat b/data/blosum30.dat new file mode 100644 index 0000000..257d9b3 --- /dev/null +++ b/data/blosum30.dat @@ -0,0 +1,30 @@ +ARNDCQEGHILKMFPSTWYVUBZX- +25 +25 4 -1 0 0 -3 1 0 0 -2 0 -1 0 1 -2 -1 1 1 -5 -4 1 0 0 0 0 -7 +25 -1 8 -2 -1 -2 3 -1 -2 -1 -3 -2 1 0 -1 -1 -1 -3 0 0 -1 0 -2 0 -1 -7 +25 0 -2 8 1 -1 -1 -1 0 -1 0 -2 0 0 -1 -3 0 1 -7 -4 -2 0 4 -1 0 -7 +25 0 -1 1 9 -3 -1 1 -1 -2 -4 -1 0 -3 -5 -1 0 -1 -4 -1 -2 0 5 0 -1 -7 +25 -3 -2 -1 -3 17 -2 1 -4 -5 -2 0 -3 -2 -3 -3 -2 -2 -2 -6 -2 0 -2 0 -2 -7 +25 1 3 -1 -1 -2 8 2 -2 0 -2 -2 0 -1 -3 0 -1 0 -1 -1 -3 0 -1 4 0 -7 +25 0 -1 -1 1 1 2 6 -2 0 -3 -1 2 -1 -4 1 0 -2 -1 -2 -3 0 0 5 -1 -7 +25 0 -2 0 -1 -4 -2 -2 8 -3 -1 -2 -1 -2 -3 -1 0 -2 1 -3 -3 0 0 -2 -1 -7 +25 -2 -1 -1 -2 -5 0 0 -3 14 -2 -1 -2 2 -3 1 -1 -2 -5 0 -3 0 -2 0 -1 -7 +25 0 -3 0 -4 -2 -2 -3 -1 -2 6 2 -2 1 0 -3 -1 0 -3 -1 4 0 -2 -3 0 -7 +25 -1 -2 -2 -1 0 -2 -1 -2 -1 2 4 -2 2 2 -3 -2 0 -2 3 1 0 -1 -1 0 -7 +25 0 1 0 0 -3 0 2 -1 -2 -2 -2 4 2 -1 1 0 -1 -2 -1 -2 0 0 1 0 -7 +25 1 0 0 -3 -2 -1 -1 -2 2 1 2 2 6 -2 -4 -2 0 -3 -1 0 0 -2 -1 0 -7 +25 -2 -1 -1 -5 -3 -3 -4 -3 -3 0 2 -1 -2 10 -4 -1 -2 1 3 1 0 -3 -4 -1 -7 +25 -1 -1 -3 -1 -3 0 1 -1 1 -3 -3 1 -4 -4 11 -1 0 -3 -2 -4 0 -2 0 -1 -7 +25 1 -1 0 0 -2 -1 0 0 -1 -1 -2 0 -2 -1 -1 4 2 -3 -2 -1 0 0 -1 0 -7 +25 1 -3 1 -1 -2 0 -2 -2 -2 0 0 -1 0 -2 0 2 5 -5 -1 1 0 0 -1 0 -7 +25 -5 0 -7 -4 -2 -1 -1 1 -5 -3 -2 -2 -3 1 -3 -3 -5 20 5 -3 0 -5 -1 -2 -7 +25 -4 0 -4 -1 -6 -1 -2 -3 0 -1 3 -1 -1 3 -2 -2 -1 5 9 1 0 -3 -2 -1 -7 +25 1 -1 -2 -2 -2 -3 -3 -3 -3 4 1 -2 0 1 -4 -1 1 -3 1 5 0 -2 -3 0 -7 +25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +25 0 -2 4 5 -2 -1 0 0 -2 -2 -1 0 -2 -3 -2 0 0 -5 -3 -2 0 5 0 -1 -7 +25 0 0 -1 0 0 4 5 -2 0 -3 -1 1 -1 -4 0 -1 -1 -1 -2 -3 0 0 4 0 -7 +25 0 -1 0 -1 -2 0 -1 -1 -1 0 0 0 0 -1 -1 0 0 -2 -1 0 0 -1 0 -1 -7 +25 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 0 -7 -7 -7 1 +# + + diff --git a/data/blosum45.dat b/data/blosum45.dat index e3fa79d..d4d11a4 100644 --- a/data/blosum45.dat +++ b/data/blosum45.dat @@ -1,30 +1,28 @@ -ARNDCQEGHILKMFPSTWYVUBZX- +ARNDCQEGHILKMFPSTWYVUBZX- 25 -25 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -2 -2 0 -1 0 0 0 -4 -25 -2 7 0 -1 -3 1 0 -2 0 -3 -2 3 -1 -2 -2 -1 -1 -2 -1 -2 -3 0 0 0 -4 -25 -1 0 6 2 -2 0 0 0 1 -2 -3 0 -2 -2 -2 1 0 -4 -2 -3 -2 0 0 0 -4 -25 -2 -1 2 7 -3 0 2 -1 0 -4 -3 0 -3 -4 -1 0 -1 -4 -2 -3 -3 0 0 0 -4 -25 -1 -3 -2 -3 12 -3 -3 -3 -3 -3 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 12 0 0 0 -4 -25 -1 1 0 0 -3 6 2 -2 1 -2 -2 1 0 -4 -1 0 -1 -2 -1 -3 -3 0 0 0 -4 -25 -1 0 0 2 -3 2 6 -2 0 -3 -2 1 -2 -3 0 0 -1 -3 -2 -3 -3 0 0 0 -4 -25 0 -2 0 -1 -3 -2 -2 7 -2 -4 -3 -2 -2 -3 -2 0 -2 -2 -3 -3 -3 0 0 0 -4 -25 -2 0 1 0 -3 1 0 -2 10 -3 -2 -1 0 -2 -2 -1 -2 -3 2 -3 -3 0 0 0 -4 -25 -1 -3 -2 -4 -3 -2 -3 -4 -3 5 2 -3 2 0 -2 -2 -1 -2 0 3 -3 0 0 0 -4 -25 -1 -2 -3 -3 -2 -2 -2 -3 -2 2 5 -3 2 1 -3 -3 -1 -2 0 1 -2 0 0 0 -4 -25 -1 3 0 0 -3 1 1 -2 -1 -3 -3 5 -1 -3 -1 -1 -1 -2 -1 -2 -3 0 0 0 -4 -25 -1 -1 -2 -3 -2 0 -2 -2 0 2 2 -1 6 0 -2 -2 -1 -2 0 1 -2 0 0 0 -4 -25 -2 -2 -2 -4 -2 -4 -3 -3 -2 0 1 -3 0 8 -3 -2 -1 1 3 0 -2 0 0 0 -4 -25 -1 -2 -2 -1 -4 -1 0 -2 -2 -2 -3 -1 -2 -3 9 -1 -1 -3 -3 -3 -4 0 0 0 -4 -25 1 -1 1 0 -1 0 0 0 -1 -2 -3 -1 -2 -2 -1 4 2 -4 -2 -1 -1 0 0 0 -4 -25 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -1 -1 2 5 -3 -1 0 -1 0 0 0 -4 -25 -2 -2 -4 -4 -5 -2 -3 -2 -3 -2 -2 -2 -2 1 -3 -4 -3 15 3 -3 -5 0 0 0 -4 -25 -2 -1 -2 -2 -3 -1 -2 -3 2 0 0 -1 0 3 -3 -2 -1 3 8 -1 -3 0 0 0 -4 -25 0 -2 -3 -3 -1 -3 -3 -3 -3 3 1 -2 1 0 -3 -1 0 -3 -1 5 -1 0 0 0 -4 -25 -1 -3 -2 -3 12 -3 -3 -3 -3 -3 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 12 0 0 0 -4 +25 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -2 -2 0 0 -1 -1 0 -5 +25 -2 7 0 -1 -3 1 0 -2 0 -3 -2 3 -1 -2 -2 -1 -1 -2 -1 -2 0 -1 0 -1 -5 +25 -1 0 6 2 -2 0 0 0 1 -2 -3 0 -2 -2 -2 1 0 -4 -2 -3 0 4 0 -1 -5 +25 -2 -1 2 7 -3 0 2 -1 0 -4 -3 0 -3 -4 -1 0 -1 -4 -2 -3 0 5 1 -1 -5 +25 -1 -3 -2 -3 12 -3 -3 -3 -3 -3 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 0 -2 -3 -2 -5 +25 -1 1 0 0 -3 6 2 -2 1 -2 -2 1 0 -4 -1 0 -1 -2 -1 -3 0 0 4 -1 -5 +25 -1 0 0 2 -3 2 6 -2 0 -3 -2 1 -2 -3 0 0 -1 -3 -2 -3 0 1 4 -1 -5 +25 0 -2 0 -1 -3 -2 -2 7 -2 -4 -3 -2 -2 -3 -2 0 -2 -2 -3 -3 0 -1 -2 -1 -5 +25 -2 0 1 0 -3 1 0 -2 10 -3 -2 -1 0 -2 -2 -1 -2 -3 2 -3 0 0 0 -1 -5 +25 -1 -3 -2 -4 -3 -2 -3 -4 -3 5 2 -3 2 0 -2 -2 -1 -2 0 3 0 -3 -3 -1 -5 +25 -1 -2 -3 -3 -2 -2 -2 -3 -2 2 5 -3 2 1 -3 -3 -1 -2 0 1 0 -3 -2 -1 -5 +25 -1 3 0 0 -3 1 1 -2 -1 -3 -3 5 -1 -3 -1 -1 -1 -2 -1 -2 0 0 1 -1 -5 +25 -1 -1 -2 -3 -2 0 -2 -2 0 2 2 -1 6 0 -2 -2 -1 -2 0 1 0 -2 -1 -1 -5 +25 -2 -2 -2 -4 -2 -4 -3 -3 -2 0 1 -3 0 8 -3 -2 -1 1 3 0 0 -3 -3 -1 -5 +25 -1 -2 -2 -1 -4 -1 0 -2 -2 -2 -3 -1 -2 -3 9 -1 -1 -3 -3 -3 0 -2 -1 -1 -5 +25 1 -1 1 0 -1 0 0 0 -1 -2 -3 -1 -2 -2 -1 4 2 -4 -2 -1 0 0 0 0 -5 +25 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -1 -1 2 5 -3 -1 0 0 0 -1 0 -5 +25 -2 -2 -4 -4 -5 -2 -3 -2 -3 -2 -2 -2 -2 1 -3 -4 -3 15 3 -3 0 -4 -2 -2 -5 +25 -2 -1 -2 -2 -3 -1 -2 -3 2 0 0 -1 0 3 -3 -2 -1 3 8 -1 0 -2 -2 -1 -5 +25 0 -2 -3 -3 -1 -3 -3 -3 -3 3 1 -2 1 0 -3 -1 0 -3 -1 5 0 -3 -3 -1 -5 25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -25 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 0 0 0 0 1 -# - - +25 -1 -1 4 5 -2 0 1 -1 0 -3 -3 0 -2 -3 -2 0 0 -4 -2 -3 0 4 2 -1 -5 +25 -1 0 0 1 -3 4 4 -2 0 -3 -2 1 -1 -3 -1 0 -1 -2 -2 -3 0 2 4 -1 -5 +25 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 0 -2 -1 -1 0 -1 -1 -1 -5 +25 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 0 -5 -5 -5 1 +# \ No newline at end of file diff --git a/data/blosum62.dat b/data/blosum62.dat index 73747c6..d2a8f15 100644 --- a/data/blosum62.dat +++ b/data/blosum62.dat @@ -1,31 +1,28 @@ -ARNDCQEGHILKMFPSTWYVUBZX- +ARNDCQEGHILKMFPSTWYVUBZX- 25 -25 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0 0 0 0 -4 -25 -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -3 0 0 0 -4 -25 -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 -3 0 0 0 -4 -25 -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 -3 0 0 0 -4 -25 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 9 0 0 0 -4 -25 -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 -3 0 0 0 -4 -25 -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 -4 0 0 0 -4 -25 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -3 0 0 0 -4 -25 -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 -3 0 0 0 -4 -25 -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -1 0 0 0 -4 -25 -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -1 0 0 0 -4 -25 -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 -3 0 0 0 -4 -25 -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -1 0 0 0 -4 -25 -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -2 0 0 0 -4 -25 -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -3 0 0 0 -4 -25 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 -1 0 0 0 -4 -25 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 0 0 0 -4 -25 -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -2 0 0 0 -4 -25 -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -2 0 0 0 -4 -25 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -1 0 0 0 -4 -25 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 9 0 0 0 -4 +25 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0 -2 -1 0 -4 +25 -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0 -1 0 -1 -4 +25 -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0 3 0 -1 -4 +25 -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0 4 1 -1 -4 +25 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0 -3 -3 -2 -4 +25 -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 0 3 -1 -4 +25 -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0 1 4 -1 -4 +25 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0 -1 -2 -1 -4 +25 -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 0 -1 -4 +25 -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0 -3 -3 -1 -4 +25 -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0 -4 -3 -1 -4 +25 -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 0 1 -1 -4 +25 -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0 -3 -1 -1 -4 +25 -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0 -3 -3 -1 -4 +25 -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0 -2 -1 -2 -4 +25 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 0 -4 +25 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0 -1 -1 0 -4 +25 -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0 -4 -3 -2 -4 +25 -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0 -3 -2 -1 -4 +25 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0 -3 -2 -1 -4 25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -25 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 0 0 0 0 1 - -# - - +25 -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 0 4 1 -1 -4 +25 -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 0 1 4 -1 -4 +25 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 0 -1 -1 -1 -4 +25 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 0 -4 -4 -4 1 +# diff --git a/data/blosum80.dat b/data/blosum80.dat index 3379596..564523c 100644 --- a/data/blosum80.dat +++ b/data/blosum80.dat @@ -1,30 +1,30 @@ ARNDCQEGHILKMFPSTWYVUBZX- 25 -25 5 -2 -2 -2 -1 -1 -1 0 -2 -2 -2 -1 -1 -3 -1 1 0 -3 -2 0 -1 0 0 0 -4 -25 -2 6 -1 -2 -4 1 -1 -3 0 -3 -3 2 -2 -4 -2 -1 -1 -4 -3 -3 -4 0 0 0 -4 -25 -2 -1 6 1 -3 0 -1 -1 0 -4 -4 0 -3 -4 -3 0 0 -4 -3 -4 -3 0 0 0 -4 -25 -2 -2 1 6 -4 -1 1 -2 -2 -4 -5 -1 -4 -4 -2 -1 -1 -6 -4 -4 -4 0 0 0 -4 -25 -1 -4 -3 -4 9 -4 -5 -4 -4 -2 -2 -4 -2 -3 -4 -2 -1 -3 -3 -1 9 0 0 0 -4 -25 -1 1 0 -1 -4 6 2 -2 1 -3 -3 1 0 -4 -2 0 -1 -3 -2 -3 -4 0 0 0 -4 -25 -1 -1 -1 1 -5 2 6 -3 0 -4 -4 1 -2 -4 -2 0 -1 -4 -3 -3 -5 0 0 0 -4 -25 0 -3 -1 -2 -4 -2 -3 6 -3 -5 -4 -2 -4 -4 -3 -1 -2 -4 -4 -4 -4 0 0 0 -4 -25 -2 0 0 -2 -4 1 0 -3 8 -4 -3 -1 -2 -2 -3 -1 -2 -3 2 -4 -4 0 0 0 -4 -25 -2 -3 -4 -4 -2 -3 -4 -5 -4 5 1 -3 1 -1 -4 -3 -1 -3 -2 3 -2 0 0 0 -4 -25 -2 -3 -4 -5 -2 -3 -4 -4 -3 1 4 -3 2 0 -3 -3 -2 -2 -2 1 -2 0 0 0 -4 -25 -1 2 0 -1 -4 1 1 -2 -1 -3 -3 5 -2 -4 -1 -1 -1 -4 -3 -3 -4 0 0 0 -4 -25 -1 -2 -3 -4 -2 0 -2 -4 -2 1 2 -2 6 0 -3 -2 -1 -2 -2 1 -2 0 0 0 -4 -25 -3 -4 -4 -4 -3 -4 -4 -4 -2 -1 0 -4 0 6 -4 -3 -2 0 3 -1 -3 0 0 0 -4 -25 -1 -2 -3 -2 -4 -2 -2 -3 -3 -4 -3 -1 -3 -4 8 -1 -2 -5 -4 -3 -4 0 0 0 -4 -25 1 -1 0 -1 -2 0 0 -1 -1 -3 -3 -1 -2 -3 -1 5 1 -4 -2 -2 -2 0 0 0 -4 -25 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -2 -1 -1 -2 -2 1 5 -4 -2 0 -1 0 0 0 -4 -25 -3 -4 -4 -6 -3 -3 -4 -4 -3 -3 -2 -4 -2 0 -5 -4 -4 11 2 -3 -3 0 0 0 -4 -25 -2 -3 -3 -4 -3 -2 -3 -4 2 -2 -2 -3 -2 3 -4 -2 -2 2 7 -2 -3 0 0 0 -4 -25 0 -3 -4 -4 -1 -3 -3 -4 -4 3 1 -3 1 -1 -3 -2 0 -3 -2 4 -1 0 0 0 -4 -25 -1 -4 -3 -4 9 -4 -5 -4 -4 -2 -2 -4 -2 -3 -4 -2 -1 -3 -3 -1 9 0 0 0 -4 +25 7 -3 -3 -3 -1 -2 -2 0 -3 -3 -3 -1 -2 -4 -1 2 0 -5 -4 -1 0 -3 -2 -1 -8 +25 -3 9 -1 -3 -6 1 -1 -4 0 -5 -4 3 -3 -5 -3 -2 -2 -5 -4 -4 0 -2 0 -2 -8 +25 -3 -1 9 2 -5 0 -1 -1 1 -6 -6 0 -4 -6 -4 1 0 -7 -4 -5 0 5 -1 -2 -8 +25 -3 -3 2 10 -7 -1 2 -3 -2 -7 -7 -2 -6 -6 -3 -1 -2 -8 -6 -6 0 6 1 -3 -8 +25 -1 -6 -5 -7 13 -5 -7 -6 -7 -2 -3 -6 -3 -4 -6 -2 -2 -5 -5 -2 0 -6 -7 -4 -8 +25 -2 1 0 -1 -5 9 3 -4 1 -5 -4 2 -1 -5 -3 -1 -1 -4 -3 -4 0 -1 5 -2 -8 +25 -2 -1 -1 2 -7 3 8 -4 0 -6 -6 1 -4 -6 -2 -1 -2 -6 -5 -4 0 1 6 -2 -8 +25 0 -4 -1 -3 -6 -4 -4 9 -4 -7 -7 -3 -5 -6 -5 -1 -3 -6 -6 -6 0 -2 -4 -3 -8 +25 -3 0 1 -2 -7 1 0 -4 12 -6 -5 -1 -4 -2 -4 -2 -3 -4 3 -5 0 -1 0 -2 -8 +25 -3 -5 -6 -7 -2 -5 -6 -7 -6 7 2 -5 2 -1 -5 -4 -2 -5 -3 4 0 -6 -6 -2 -8 +25 -3 -4 -6 -7 -3 -4 -6 -7 -5 2 6 -4 3 0 -5 -4 -3 -4 -2 1 0 -7 -5 -2 -8 +25 -1 3 0 -2 -6 2 1 -3 -1 -5 -4 8 -3 -5 -2 -1 -1 -6 -4 -4 0 -1 1 -2 -8 +25 -2 -3 -4 -6 -3 -1 -4 -5 -4 2 3 -3 9 0 -4 -3 -1 -3 -3 1 0 -5 -3 -2 -8 +25 -4 -5 -6 -6 -4 -5 -6 -6 -2 -1 0 -5 0 10 -6 -4 -4 0 4 -2 0 -6 -6 -3 -8 +25 -1 -3 -4 -3 -6 -3 -2 -5 -4 -5 -5 -2 -4 -6 12 -2 -3 -7 -6 -4 0 -4 -2 -3 -8 +25 2 -2 1 -1 -2 -1 -1 -1 -2 -4 -4 -1 -3 -4 -2 7 2 -6 -3 -3 0 0 -1 -1 -8 +25 0 -2 0 -2 -2 -1 -2 -3 -3 -2 -3 -1 -1 -4 -3 2 8 -5 -3 0 0 -1 -2 -1 -8 +25 -5 -5 -7 -8 -5 -4 -6 -6 -4 -5 -4 -6 -3 0 -7 -6 -5 16 3 -5 0 -8 -5 -5 -8 +25 -4 -4 -4 -6 -5 -3 -5 -6 3 -3 -2 -4 -3 4 -6 -3 -3 3 11 -3 0 -5 -4 -3 -8 +25 -1 -4 -5 -6 -2 -4 -4 -6 -5 4 1 -4 1 -2 -4 -3 0 -5 -3 7 0 -6 -4 -2 -8 25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -25 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 0 0 0 0 1 +25 -3 -2 5 6 -6 -1 1 -2 -1 -6 -7 -1 -5 -6 -4 0 -1 -8 -5 -6 0 6 0 -3 -8 +25 -2 0 -1 1 -7 5 6 -4 0 -6 -5 1 -3 -6 -2 -1 -2 -5 -4 -4 0 0 6 -1 -8 +25 -1 -2 -2 -3 -4 -2 -2 -3 -2 -2 -2 -2 -2 -3 -3 -1 -1 -5 -3 -2 0 -3 -1 -2 -8 +25 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 0 -8 -8 -8 1 # diff --git a/data/pam120.dat b/data/pam120.dat new file mode 100644 index 0000000..c5b7553 --- /dev/null +++ b/data/pam120.dat @@ -0,0 +1,28 @@ +ARNDCQEGHILKMFPSTWYVUBZX- +25 +25 3 -3 -1 0 -3 -1 0 1 -3 -1 -3 -2 -2 -4 1 1 1 -7 -4 0 0 0 -1 -1 -8 +25 -3 6 -1 -3 -4 1 -3 -4 1 -2 -4 2 -1 -5 -1 -1 -2 1 -5 -3 0 -2 -1 -2 -8 +25 -1 -1 4 2 -5 0 1 0 2 -2 -4 1 -3 -4 -2 1 0 -4 -2 -3 0 3 0 -1 -8 +25 0 -3 2 5 -7 1 3 0 0 -3 -5 -1 -4 -7 -3 0 -1 -8 -5 -3 0 4 3 -2 -8 +25 -3 -4 -5 -7 9 -7 -7 -4 -4 -3 -7 -7 -6 -6 -4 0 -3 -8 -1 -3 0 -6 -7 -4 -8 +25 -1 1 0 1 -7 6 2 -3 3 -3 -2 0 -1 -6 0 -2 -2 -6 -5 -3 0 0 4 -1 -8 +25 0 -3 1 3 -7 2 5 -1 -1 -3 -4 -1 -3 -7 -2 -1 -2 -8 -5 -3 0 3 4 -1 -8 +25 1 -4 0 0 -4 -3 -1 5 -4 -4 -5 -3 -4 -5 -2 1 -1 -8 -6 -2 0 0 -2 -2 -8 +25 -3 1 2 0 -4 3 -1 -4 7 -4 -3 -2 -4 -3 -1 -2 -3 -3 -1 -3 0 1 1 -2 -8 +25 -1 -2 -2 -3 -3 -3 -3 -4 -4 6 1 -3 1 0 -3 -2 0 -6 -2 3 0 -3 -3 -1 -8 +25 -3 -4 -4 -5 -7 -2 -4 -5 -3 1 5 -4 3 0 -3 -4 -3 -3 -2 1 0 -4 -3 -2 -8 +25 -2 2 1 -1 -7 0 -1 -3 -2 -3 -4 5 0 -7 -2 -1 -1 -5 -5 -4 0 0 -1 -2 -8 +25 -2 -1 -3 -4 -6 -1 -3 -4 -4 1 3 0 8 -1 -3 -2 -1 -6 -4 1 0 -4 -2 -2 -8 +25 -4 -5 -4 -7 -6 -6 -7 -5 -3 0 0 -7 -1 8 -5 -3 -4 -1 4 -3 0 -5 -6 -3 -8 +25 1 -1 -2 -3 -4 0 -2 -2 -1 -3 -3 -2 -3 -5 6 1 -1 -7 -6 -2 0 -2 -1 -2 -8 +25 1 -1 1 0 0 -2 -1 1 -2 -2 -4 -1 -2 -3 1 3 2 -2 -3 -2 0 0 -1 -1 -8 +25 1 -2 0 -1 -3 -2 -2 -1 -3 0 -3 -1 -1 -4 -1 2 4 -6 -3 0 0 0 -2 -1 -8 +25 -7 1 -4 -8 -8 -6 -8 -8 -3 -6 -3 -5 -6 -1 -7 -2 -6 12 -2 -8 0 -6 -7 -5 -8 +25 -4 -5 -2 -5 -1 -5 -5 -6 -1 -2 -2 -5 -4 4 -6 -3 -3 -2 8 -3 0 -3 -5 -3 -8 +25 0 -3 -3 -3 -3 -3 -3 -2 -3 3 1 -4 1 -3 -2 -2 0 -8 -3 5 0 -3 -3 -1 -8 +25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +25 0 -2 3 4 -6 0 3 0 1 -3 -4 0 -4 -5 -2 0 0 -6 -3 -3 0 4 2 -1 -8 +25 -1 -1 0 3 -7 4 4 -2 1 -3 -3 -1 -2 -6 -1 -1 -2 -7 -5 -3 0 2 4 -1 -8 +25 -1 -2 -1 -2 -4 -1 -1 -2 -2 -1 -2 -2 -2 -3 -2 -1 -1 -5 -3 -1 0 -1 -1 -2 -8 +25 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 0 -8 -8 -8 1 +# \ No newline at end of file diff --git a/data/pam20.dat b/data/pam20.dat new file mode 100644 index 0000000..3307af1 --- /dev/null +++ b/data/pam20.dat @@ -0,0 +1,28 @@ +ARNDCQEGHILKMFPSTWYVUBZX- +25 +25 6 -8 -5 -4 -8 -5 -3 -3 -8 -6 -7 -8 -6 -9 -2 -1 -1 -16 -9 -3 0 -5 -4 -4 -19 +25 -8 9 -7 -12 -9 -2 -11 -11 -3 -6 -10 -1 -5 -10 -5 -4 -8 -3 -11 -9 0 -9 -5 -7 -19 +25 -5 -7 8 1 -13 -5 -3 -4 -1 -6 -8 -2 -11 -10 -7 -1 -3 -9 -5 -9 0 6 -4 -4 -19 +25 -4 -12 1 8 -16 -4 2 -4 -5 -9 -15 -6 -13 -17 -9 -5 -6 -17 -13 -9 0 6 0 -7 -19 +25 -8 -9 -13 -16 10 -16 -16 -11 -8 -7 -17 -16 -16 -15 -9 -4 -9 -18 -5 -7 0 -14 -16 -11 -19 +25 -5 -2 -5 -4 -16 9 0 -8 0 -9 -6 -4 -5 -15 -4 -6 -7 -15 -14 -8 0 -4 7 -6 -19 +25 -3 -11 -3 2 -16 0 8 -5 -6 -6 -10 -5 -8 -16 -7 -5 -7 -19 -9 -8 0 0 6 -6 -19 +25 -3 -11 -4 -4 -11 -8 -5 7 -10 -13 -12 -8 -10 -10 -7 -3 -7 -17 -16 -7 0 -4 -6 -6 -19 +25 -8 -3 -1 -5 -8 0 -6 -10 9 -11 -7 -8 -13 -7 -5 -7 -8 -8 -4 -7 0 -2 -2 -6 -19 +25 -6 -6 -6 -9 -7 -9 -6 -13 -11 9 -2 -7 -2 -3 -10 -8 -3 -16 -7 1 0 -7 -7 -6 -19 +25 -7 -10 -8 -15 -17 -6 -10 -12 -7 -2 7 -9 0 -4 -8 -9 -8 -7 -8 -3 0 -10 -8 -7 -19 +25 -8 -1 -2 -6 -16 -4 -5 -8 -8 -7 -9 7 -3 -16 -8 -5 -4 -14 -10 -10 0 -3 -5 -6 -19 +25 -6 -5 -11 -13 -16 -5 -8 -10 -13 -2 0 -3 11 -5 -9 -6 -5 -15 -13 -2 0 -12 -6 -6 -19 +25 -9 -10 -10 -17 -15 -15 -16 -10 -7 -3 -4 -16 -5 9 -11 -7 -10 -6 1 -9 0 -12 -16 -9 -19 +25 -2 -5 -7 -9 -9 -4 -7 -7 -5 -10 -8 -8 -9 -11 8 -3 -5 -16 -16 -7 0 -8 -5 -6 -19 +25 -1 -4 -1 -5 -4 -6 -5 -3 -7 -8 -9 -5 -6 -7 -3 7 0 -6 -8 -8 0 -2 -6 -4 -19 +25 -1 -8 -3 -6 -9 -7 -7 -7 -8 -3 -8 -4 -5 -10 -5 0 7 -15 -7 -4 0 -4 -7 -5 -19 +25 -16 -3 -9 -17 -18 -15 -19 -17 -8 -16 -7 -14 -15 -6 -16 -6 -15 13 -6 -18 0 -11 -17 -13 -19 +25 -9 -11 -5 -13 -5 -14 -9 -16 -4 -7 -8 -10 -13 1 -16 -8 -7 -6 10 -8 0 -7 -11 -9 -19 +25 -3 -9 -9 -9 -7 -8 -8 -7 -7 1 -3 -10 -2 -9 -7 -8 -4 -18 -8 7 0 -9 -8 -6 -19 +25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +25 -5 -9 6 6 -14 -4 0 -4 -2 -7 -10 -3 -12 -12 -8 -2 -4 -11 -7 -9 0 6 -1 -6 -19 +25 -4 -5 -4 0 -16 7 6 -6 -2 -7 -8 -5 -6 -16 -5 -6 -7 -17 -11 -8 0 -1 6 -6 -19 +25 -4 -7 -4 -7 -11 -6 -6 -6 -6 -6 -7 -6 -6 -9 -6 -4 -5 -13 -9 -6 0 -6 -6 -6 -19 +25 -19 -19 -19 -19 -19 -19 -19 -19 -19 -19 -19 -19 -19 -19 -19 -19 -19 -19 -19 -19 0 -19 -19 -19 1 +# \ No newline at end of file diff --git a/data/pam350.dat b/data/pam350.dat new file mode 100644 index 0000000..6ec5db3 --- /dev/null +++ b/data/pam350.dat @@ -0,0 +1,28 @@ +ARNDCQEGHILKMFPSTWYVUBZX- +25 +25 2 -1 0 1 -2 0 1 2 -1 0 -2 -1 -1 -4 1 1 1 -7 -4 0 0 1 0 0 -10 +25 -1 7 1 -1 -4 2 0 -2 2 -2 -3 4 0 -5 0 0 -1 4 -5 -3 0 0 1 -1 -10 +25 0 1 2 2 -4 1 2 1 2 -2 -3 1 -2 -4 0 1 1 -5 -3 -2 0 2 2 0 -10 +25 1 -1 2 4 -6 2 4 1 1 -2 -4 1 -3 -6 0 1 0 -8 -5 -2 0 3 3 -1 -10 +25 -2 -4 -4 -6 18 -6 -6 -4 -4 -3 -7 -6 -6 -5 -3 0 -2 -10 1 -2 0 -5 -6 -3 -10 +25 0 2 1 2 -6 4 3 -1 3 -2 -2 1 -1 -5 1 0 0 -5 -5 -2 0 2 3 0 -10 +25 1 0 2 4 -6 3 4 1 1 -2 -4 0 -2 -6 0 0 0 -8 -5 -2 0 3 3 0 -10 +25 2 -2 1 1 -4 -1 1 5 -2 -2 -4 -1 -3 -6 0 1 1 -8 -6 -1 0 1 0 -1 -10 +25 -1 2 2 1 -4 3 1 -2 7 -2 -2 1 -2 -2 0 -1 -1 -3 0 -2 0 1 2 0 -10 +25 0 -2 -2 -2 -3 -2 -2 -2 -2 5 4 -2 3 2 -2 -1 0 -6 0 4 0 -2 -2 0 -10 +25 -2 -3 -3 -4 -7 -2 -4 -4 -2 4 8 -3 5 3 -3 -3 -2 -2 0 3 0 -4 -3 -1 -10 +25 -1 4 1 1 -6 1 0 -1 1 -2 -3 5 0 -6 -1 0 0 -4 -5 -2 0 1 1 -1 -10 +25 -1 0 -2 -3 -6 -1 -2 -3 -2 3 5 0 6 1 -2 -2 -1 -5 -2 2 0 -2 -2 0 -10 +25 -4 -5 -4 -6 -5 -5 -6 -6 -2 2 3 -6 1 13 -5 -4 -3 1 11 -1 0 -5 -6 -2 -10 +25 1 0 0 0 -3 1 0 0 0 -2 -3 -1 -2 -5 6 1 1 -7 -6 -1 0 0 0 0 -10 +25 1 0 1 1 0 0 0 1 -1 -1 -3 0 -2 -4 1 1 1 -3 -3 -1 0 1 0 0 -10 +25 1 -1 1 0 -2 0 0 1 -1 0 -2 0 -1 -3 1 1 2 -6 -3 0 0 0 0 0 -10 +25 -7 4 -5 -8 -10 -5 -8 -8 -3 -6 -2 -4 -5 1 -7 -3 -6 27 1 -7 0 -6 -7 -5 -10 +25 -4 -5 -3 -5 1 -5 -5 -6 0 0 0 -5 -2 11 -6 -3 -3 1 14 -2 0 -4 -5 -2 -10 +25 0 -3 -2 -2 -2 -2 -2 -1 -2 4 3 -2 2 -1 -1 -1 0 -7 -2 5 0 -2 -2 0 -10 +25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +25 1 0 2 3 -5 2 3 1 1 -2 -4 1 -2 -5 0 1 0 -6 -4 -2 0 3 2 0 -10 +25 0 1 2 3 -6 3 3 0 2 -2 -3 1 -2 -6 0 0 0 -7 -5 -2 0 2 3 0 -10 +25 0 -1 0 -1 -3 0 0 -1 0 0 -1 -1 0 -2 0 0 0 -5 -2 0 0 0 0 -1 -10 +25 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 0 -10 -10 -10 1 +# \ No newline at end of file diff --git a/data/pam60.dat b/data/pam60.dat new file mode 100644 index 0000000..97c6cde --- /dev/null +++ b/data/pam60.dat @@ -0,0 +1,28 @@ +ARNDCQEGHILKMFPSTWYVUBZX- +25 +25 5 -5 -2 -2 -5 -3 -1 0 -5 -3 -4 -5 -3 -6 0 1 1 -10 -6 -1 0 -2 -2 -2 -12 +25 -5 8 -3 -6 -6 0 -6 -7 0 -4 -6 2 -2 -7 -2 -2 -4 0 -8 -5 0 -5 -2 -4 -12 +25 -2 -3 6 2 -7 -2 0 -1 1 -4 -5 0 -6 -6 -4 1 -1 -6 -3 -5 0 5 -1 -2 -12 +25 -2 -6 2 7 -10 -1 3 -2 -2 -5 -9 -2 -7 -11 -5 -2 -3 -11 -8 -6 0 5 2 -3 -12 +25 -5 -6 -7 -10 9 -10 -10 -7 -6 -4 -11 -10 -10 -9 -6 -1 -5 -12 -2 -4 0 -9 -10 -6 -12 +25 -3 0 -2 -1 -10 7 2 -5 2 -5 -3 -1 -2 -9 -1 -3 -4 -9 -8 -5 0 -1 6 -3 -12 +25 -1 -6 0 3 -10 2 7 -2 -3 -4 -7 -3 -5 -10 -3 -2 -4 -12 -7 -4 0 2 5 -3 -12 +25 0 -7 -1 -2 -7 -5 -2 6 -6 -7 -8 -5 -6 -7 -4 0 -3 -11 -10 -4 0 -2 -3 -3 -12 +25 -5 0 1 -2 -6 2 -3 -6 8 -6 -4 -4 -7 -4 -2 -4 -5 -5 -2 -5 0 0 0 -3 -12 +25 -3 -4 -4 -5 -4 -5 -4 -7 -6 7 0 -4 1 -1 -6 -4 -1 -10 -4 3 0 -4 -4 -3 -12 +25 -4 -6 -5 -9 -11 -3 -7 -8 -4 0 6 -6 2 -1 -5 -6 -5 -4 -5 -1 0 -7 -5 -4 -12 +25 -5 2 0 -2 -10 -1 -3 -5 -4 -4 -6 6 0 -10 -4 -2 -2 -8 -7 -6 0 -1 -2 -3 -12 +25 -3 -2 -6 -7 -10 -2 -5 -6 -7 1 2 0 10 -2 -6 -4 -2 -9 -7 0 0 -6 -4 -3 -12 +25 -6 -7 -6 -11 -9 -9 -10 -7 -4 -1 -1 -10 -2 8 -7 -5 -6 -3 3 -5 0 -8 -10 -5 -12 +25 0 -2 -4 -5 -6 -1 -3 -4 -2 -6 -5 -4 -6 -7 7 0 -2 -10 -10 -4 0 -4 -2 -3 -12 +25 1 -2 1 -2 -1 -3 -2 0 -4 -4 -6 -2 -4 -5 0 5 1 -4 -5 -4 0 0 -3 -2 -12 +25 1 -4 -1 -3 -5 -4 -4 -3 -5 -1 -5 -2 -2 -6 -2 1 6 -9 -5 -1 0 -2 -4 -2 -12 +25 -10 0 -6 -11 -12 -9 -12 -11 -5 -10 -4 -8 -9 -3 -10 -4 -9 13 -3 -11 0 -8 -11 -8 -12 +25 -6 -8 -3 -8 -2 -8 -7 -10 -2 -4 -5 -7 -7 3 -10 -5 -5 -3 9 -5 0 -5 -7 -5 -12 +25 -1 -5 -5 -6 -4 -5 -4 -4 -5 3 -1 -6 0 -5 -4 -4 -1 -11 -5 6 0 -5 -5 -3 -12 +25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +25 -2 -5 5 5 -9 -1 2 -2 0 -4 -7 -1 -6 -8 -4 0 -2 -8 -5 -5 0 5 1 -3 -12 +25 -2 -2 -1 2 -10 6 5 -3 0 -4 -5 -2 -4 -10 -2 -3 -4 -11 -7 -5 0 1 5 -3 -12 +25 -2 -4 -2 -3 -6 -3 -3 -3 -3 -3 -4 -3 -3 -5 -3 -2 -2 -8 -5 -3 0 -3 -3 -3 -12 +25 -12 -12 -12 -12 -12 -12 -12 -12 -12 -12 -12 -12 -12 -12 -12 -12 -12 -12 -12 -12 0 -12 -12 -12 1 +# \ No newline at end of file diff --git a/samples/cyc_c1.fasta b/samples/cyc_c1.fasta new file mode 100644 index 0000000..43842fa --- /dev/null +++ b/samples/cyc_c1.fasta @@ -0,0 +1,146 @@ +>sp|P00125|CY1_BOVIN Cytochrome c1, heme protein, mitochondrial OS=Bos taurus GN=CYC1 PE=1 SV=2 +MAAAAATLRGAMVGPRGAGLPGARARGLLCGARPGQLPLRTPQAVSLSSKSGLSRGRKVI +LSALGMLAAGGAGLAVALHSAVSASDLELHPPSYPWSHRGLLSSLDHTSIRRGFQVYKQV +CSSCHSMDYVAYRHLVGVCYTEDEAKALAEEVEVQDGPNEDGEMFMRPGKLSDYFPKPYP +NPEAARAANNGALPPDLSYIVRARHGGEDYVFSLLTGYCEPPTGVSLREGLYFNPYFPGQ +AIGMAPPIYNEVLEFDDGTPATMSQVAKDVCTFLRWAAEPEHDHRKRMGLKMLLMMGLLL +PLVYAMKRHKWSVLKSRKLAYRPPK +>sp|P08574|CY1_HUMAN Cytochrome c1, heme protein, mitochondrial OS=Homo sapiens GN=CYC1 PE=1 SV=3 +MAAAAASLRGVVLGPRGAGLPGARARGLLCSARPGQLPLRTPQAVALSSKSGLSRGRKVM +LSALGMLAAGGAGLAMALHSAVSASDLELHPPSYPWSHRGLLSSLDHTSIRRGFQVYKQV +CASCHSMDFVAYRHLVGVCYTEDEAKELAAEVEVQDGPNEDGEMFMRPGKLFDYFPKPYP +NSEAARAANNGALPPDLSYIVRARHGGEDYVFSLLTGYCEPPTGVSLREGLYFNPYFPGQ +AIAMAPPIYTDVLEFDDGTPATMSQIAKDVCTFLRWASEPEHDHRKRMGLKMLMMMALLV +PLVYTIKRHKWSVLKSRKLAYRPPK +>sp|P00044|CYC1_YEAST Cytochrome c iso-1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=CYC1 PE=1 SV=2 +MTEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHSGQAEGYSYTDANIK +KNVLWDENNMSEYLTNPKKYIPGTKMAFGGLKKEKDRNDLITYLKKACE +>sp|P30183|CCB11_ARATH Cyclin-B1-1 OS=Arabidopsis thaliana GN=CYCB1-1 PE=1 SV=2 +MMTSRSIVPQQSTDDVVVVDGKNVAKGRNRQVLGDIGNVVRGNYPKNNEPEKINHRPRTR +SQNPTLLVEDNLKKPVVKRNAVPKPKKVAGKPKVVDVIEISSDSDEELGLVAAREKKATK +KKATTYTSVLTARSKAACGLEKKQKEKIVDIDSADVENDLAAVEYVEDIYSFYKSVESEW +RPRDYMASQPDINEKMRLILVEWLIDVHVRFELNPETFYLTVNILDRFLSVKPVPRKELQ +LVGLSALLMSAKYEEIWPPQVEDLVDIADHAYSHKQILVMEKTILSTLEWYLTVPTHYVF +LARFIKASIADEKMENMVHYLAELGVMHYDTMIMFSPSMVAASAIYAARSSLRQVPIWTS +TLKHHTGYSETQLMDCAKLLAYQQWKQQEEGSESSTKGALRKKYSKDERFAVALIPPAKA +LLTGTESA +>sp|Q0JF02|CPS4_ORYSJ Syn-copalyl diphosphate synthase OS=Oryza sativa subsp. japonica GN=CPS4 PE=2 SV=1 +MPVFTASFQCVTLFGQPASAADAQPLLQGQRPFLHLHARRRRPCGPMLISKSPPYPASEE +TREWEADGQHEHTDELRETTTTMIDGIRTALRSIGEGEISISAYDTSLVALLKRLDGGDG +PQFPSTIDWIVQNQLPDGSWGDASFFMMGDRIMSTLACVVALKSWNIHTDKCERGLLFIQ +ENMWRLAHEEEDWMLVGFEIALPSLLDMAKDLDLDIPYDEPALKAIYAERERKLAKIPRD +VLHSMPTTLLHSLEGMVDLDWEKLLKLRCLDGSFHCSPASTATAFQQTGDQKCFEYLDGI +VKKFNGGVPCIYPLDVYERLWAVDRLTRLGISRHFTSEIEDCLDYIFRNWTPDGLAHTKN +CPVKDIDDTAMGFRLLRLYGYQVDPCVLKKFEKDGKFFCLHGESNPSSVTPMYNTYRASQ +LKFPGDDGVLGRAEVFCRSFLQDRRGSNRMKDKWAIAKDIPGEVEYAMDYPWKASLPRIE +TRLYLDQYGGSGDVWIGKVLHRMTLFCNDLYLKAAKADFSNFQKECRVELNGLRRWYLRS +NLEKFGGTDPQTTLMTSYFLASANIFEANRAAERLGWARVALLADAVSSHFRRIGGPKNS +TSNLEELISLVPFDDAYSGSLREAWKQWLMAWTAKESSQESIEGDTAILLVRAIEIFGGR +HVLTGQRPDLWEYSQLEQLTSSICCKLSRRVLAQENGESTEKVEEIDQQVDLEMQELTRR +VLQGCSAINRLTRETFLHVVKSFCYVAYCSPETIDSHIDKVIFQDVI +>sp|P00048|CYC_NEUCR Cytochrome c OS=Neurospora crassa (strain ATCC 24698 / 74-OR23-1A / CBS 708.71 / DSM 1257 / FGSC 987) GN=cyc-1 PE=1 SV=2 +MGFSAGDSKKGANLFKTRCAQCHTLEEGGGNKIGPALHGLFGRKTGSVDGYAYTDANKQK +GITWDENTLFEYLENPKKYIPGTKMAFGGLKKDKDRNDIITFMKEATA +>sp|Q9D0M3|CY1_MOUSE Cytochrome c1, heme protein, mitochondrial OS=Mus musculus GN=Cyc1 PE=1 SV=1 +MAAAAASLRRTVLGPRGVGLPGASAPGLLGGARSRQLPLRTPQAVSLSSKSGPSRGRKVM +LSALGMLAAGGAGLAVALHSAVSASDLELHPPSYPWSHRGLLSSLDHTSIRRGFQVYKQV +CSSCHSMDYVAYRHLVGVCYTEEEAKALAEEVEVQDGPNDDGEMFMRPGKLSDYFPKPYP +NPEAARAANNGALPPDLSYIVRARHGGEDYVFSLLTGYCEPPTGVSLREGLYFNPYFPGQ +AIGMAPPIYTEVLEYDDGTPATMSQVAKDVATFLRWASEPEHDHRKRMGLKMLLMMGLLL +PLTYAMKRHKWSVLKSRKLAYRPPK +>sp|Q9K499|CYC1_STRCO Epi-isozizaene synthase OS=Streptomyces coelicolor (strain ATCC BAA-471 / A3(2) / M145) GN=cyc1 PE=1 SV=1 +MHAFPHGTTATPTAIAVPPSLRLPVIEAAFPRQLHPYWPKLQETTRTWLLEKRLMPADKV +EEYADGLCYTDLMAGYYLGAPDEVLQAIADYSAWFFVWDDRHDRDIVHGRAGAWRRLRGL +LHTALDSPGDHLHHEDTLVAGFADSVRRLYAFLPATWNARFARHFHTVIEAYDREFHNRT +RGIVPGVEEYLELRRLTFAHWIWTDLLEPSSGCELPDAVRKHPAYRRAALLSQEFAAWYN +DLCSLPKEIAGDEVHNLGISLITHHSLTLEEAIGEVRRRVEECITEFLAVERDALRFADE +LADGTVRGKELSGAVRANVGNMRNWFSSVYWFHHESGRYMVDSWDDRSTPPYVNNEAAGE +K +>sp|Q753F4|CYC_ASHGO Cytochrome c OS=Ashbya gossypii (strain ATCC 10895 / CBS 109.51 / FGSC 9923 / NRRL Y-1056) GN=CYC1 PE=3 SV=2 +MPAPYKPGSEKKGATLFKTRCLQCHTTEKGGPHKVGPNLHGVFSRQSGQVSGYSYTDAMI +NRNVTWDAQSMSDYLENPKKYIPGTKMAFGGLKKEKDRNDLITYMLKACK +>sp|P25400|CYC_CANGA Cytochrome c OS=Candida glabrata (strain ATCC 2001 / CBS 138 / JCM 3761 / NBRC 0622 / NRRL Y-65) GN=CYC1 PE=3 SV=1 +MSEKKGATLFKTRCLQCHTVEKGGPNKVGPNLHGIFGRKSGQAAGYSYTDANIKKNVTWD +EDNMSDYLTNPKKYIPGTKMAFGGLKKEKDRKDLIAYLKKATSD +>sp|P53698|CYC_CANAL Cytochrome c OS=Candida albicans (strain SC5314 / ATCC MYA-2876) GN=CYC1 PE=3 SV=3 +MPAPFEKGSEKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGVFGRKSGLAEGYSYTDANK +KKGVEWTEQTMSDYLENPKKYIPGTKMAFGGLKKPKDRNDLVTYLKKATS +>sp|P15451|CYC_CHLRE Cytochrome c OS=Chlamydomonas reinhardtii GN=CYC1 PE=3 SV=2 +MSTFAEAPAGDLARGEKIFKTKCAQCHVAEKGGGHKQGPNLGGLFGRVSGTAAGFAYSKA +NKEAAVTWGESTLYEYLLNPKKYMPGNKMVFAGLKKPEERADLIAYLKQATA +>sp|P00043|CYC_DEBHA Cytochrome c OS=Debaryomyces hansenii (strain ATCC 36239 / CBS 767 / JCM 1990 / NBRC 0083 / IGC 2968) GN=CYC1 PE=1 SV=3 +MPAPYEKGSEKKGANLFKTRCLQCHTVEEGGPHKVGPNLHGVVGRTSGQAQGFSYTDANK +KKGVEWSEQNLSDYLENPKKYIPGTKMAFGGLKKAKDRNDLISYLVKATK +>sp|P00041|CYC_ISSOR Cytochrome c OS=Issatchenkia orientalis GN=CYC1 PE=1 SV=1 +PAPFEQGSAKKGATLFKTRCAQCHTIEAGGPHKVGPNLHGIFSRHSGQAEGYSYTDANKR +AGVEWAEPTMSDYLENPKKYIPGTKMAFGGLKKAKDRNDLVTYMLEASK +>sp|O13393|CYC_PICST Cytochrome c OS=Scheffersomyces stipitis (strain ATCC 58785 / CBS 6054 / NBRC 10063 / NRRL Y-11545) GN=CYC1 PE=3 SV=3 +MPAPFEKGSEKKGATLFKTRCLQCHTVEEGGPHKVGPNLHGIMGRKSGQAVGYSYTDANK +KKGVEWSEQTMSDYLENPKKYIPGTKMAFGGLKKPKDRNDLVTYLASATK +>sp|P00046|CYC_SCHPO Cytochrome c OS=Schizosaccharomyces pombe (strain 972 / ATCC 24843) GN=cyc1 PE=1 SV=3 +MPYAPGDEKKGASLFKTRCAQCHTVEKGGANKVGPNLHGVFGRKTGQAEGFSYTEANRDK +GITWDEETLFAYLENPKKYIPGTKMAFAGFKKPADRNNVITYLKKATSE +>sp|Q6E7D7|CPS4_ORYSI Syn-copalyl diphosphate synthase OS=Oryza sativa subsp. indica GN=CPS4 PE=2 SV=1 +MPVFTASFQCVTLFGQPASAADAQPLLQGQRPFLHLHARRRRPCGPMLISKSPPYPASEE +TREWEAEGQHEHTDELRETTTTMIDGIRTALRSIGEGEISISAYDTSLVALLKRLDGGDG +PQFPSTIDWIVQNQLPDGSWGDASFFMMGDRIMSTLACVVALKSWNIHTDKCERGLLFIQ +ENMWRLAHEEEDWMLVGFEIALPSLLDMAKDLDLDIPYDEPALKAIYAERERKLAKIPRD +VLHAMPTTLLHSLEGMVDLDWEKLLKLRCLDGSFHCSPASTATAFQQTGDQKCFEYLDGI +VKKFNGGVPCIYPLDVYERLWAVDRLTRLGISRHFTSEIEDCLDYIFRNWTPDGLAHTKN +CPVKDIDDTAMGFRLLRLYGYQVDPCVLKKFEKDGKFFCLHGESNPSSVTPMYNTYRASQ +LKFPGDDGVLGRAEVFCRSFLQDRRGSNRMKDKWAIAKDIPGEVEYAMDYPWKASLPRIE +TRLYLDQYGGSGDVWIGKVLHRMTLFCNDLYLKAAKADFSNFQKECRVELNGLRRWYLRS +NLERFGGTDPQTTLMTSYFLASANIFEPNRAAERLGWARVALLADAVSSHFRRIGGPKNL +TSNLEELISLVPFDDAYSGSLREAWKQWLMAWTAKESSQESIEGDTAILLVRAIEIFGGR +HVLTGQRPDLWEYSQLEQLTSSICRKLYRRVLAQENGKSTEKVEEIDQQLDLEMQELTRR +VLQGCSAINRLTRETFLHVVKSFCYVAYCSPETIDNHIDKVIFQDVI +>sp|Q54D07|CY1_DICDI Cytochrome c1, heme protein, mitochondrial OS=Dictyostelium discoideum GN=cyc1 PE=3 SV=1 +MFSQKLLANGKLLSKLAIVSGVVGTATAITTSTTLLSDDFVQPTSYPWEHRFPWQSYDHA +AIRRGHQVYTQVCSTCHSLNLISYRHLANVAYTPEELKAMAADTTVMDGPDSEGDMFERK +GQVTDFHPKPYPNSNAARFANNGALPPDLSLVIKARGAHEDYVFSLLTGYCEPPAGVRVV +GGQYFNPYFPGTKIAMAPPLADGMVEYDDGTDNSMSQMAKDVSTFLCWASEPEHDDRKKL +GMKVLLGVSIIALPLFYWKRLKWSVIKTRKISFKD +>sp|P74917|CY552_ACIFR Cytochrome c-552 OS=Acidithiobacillus ferrooxidans GN=cyc1 PE=1 SV=2 +MTTYLSQDRLRNKENDTMTYQHSKMYQSRTFLLFSALLLVAGQASAAVGSADAPAPYRVS +SDCMVCHGMTGRDTLYPIVPRLAGQHKSYMEAQLKAYKDHSRADQNGEIYMWPVAQALDS +AKITALADYFNAQKPPMQSSGIKHAGAKEGKAIFNQGVTNEQIPACMECHGSDGQGAGPF +PRLAGQRYGYIIQQLTYFHNGTRVNTLMNQIAKNITVAQMKDVAAYLSSL +>sp|Q9AJE4|CYC1_KITGR Terpentedienyl-diphosphate synthase OS=Kitasatospora griseola GN=cyc1 PE=1 SV=1 +MKDRAADPVTKFSPSPYETGQFLRISERADVGTPQIDYLLATQRPDGLWGSVGFELVPTL +GAVAGLSSRPEYADRAGVTDAVARACEKLWELALGEGGLPKLPDTVASEIIVPSLIDLLS +EVLQRHRPAVGGKAGQEQEFPSPPGANAELWRQLSDRIARGQAIPKTAWHTLEAFHPLPK +QFAATVTPAADGAVTCSPSSTAAWLSAVGTDAGASTRAYLDEAQSRYGGAIPMGSSMPYF +EVLWVLNLVLKYFPDVPIPREIIEEIAAGFSDSGIGGGPGLPPDGDDTAYANLAGDKLGA +PTHPEILMKFWAEDHFVSYPGEQTPSETVNAHALEYLNHLRMRRGITEFGAVEDACAEWV +ISQQTEDGCWYDKWNVSPYYSTAACVEALLDARKQDEPQLDSLRRAREWLLRHQTDSGGW +GMAEPSPEETAYAVLALDLFASRGGEGAEECAAAISRAKEFFTDESRENPPLWMGKDLYT +PFRIVDVTVMCGRAVVGRY +>sp|Q4HVX7|CYC_GIBZE Cytochrome c OS=Gibberella zeae (strain PH-1 / ATCC MYA-4620 / FGSC 9075 / NRRL 31084) GN=CYC1 PE=3 SV=2 +MAGGDIKKGANLFKTRCAQCHTVEKDGGNKIGPALHGLWGRKTGSVEGYSYTDANKQKGI +EWNDDTLFEYLENPKKYIPGTKMAFGGLKKAKDRNDLIAYLKDSTK +>sp|P19681|CYC_SCHOC Cytochrome c OS=Schwanniomyces occidentalis GN=CYC1 PE=3 SV=2 +MPAPYEKGSEKKDANLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRKSGQAAGYSYTDANK +KKGVEWTEQTMSDYLENPKKYIPGTKMAFGGLKKPKDRNDLITYLANATK +>sp|Q6C9Q0|CYC_YARLI Cytochrome c OS=Yarrowia lipolytica (strain CLIB 122 / E 150) GN=CYC1 PE=3 SV=1 +MGYKEGSAKKGATLFKTRCAQCHTTEAGGPHKVGPNLHGVINRHSGEAEGYSYSDANKRK +GIEWTTEHLFEYLENPKKYIPGTKMAFGGLKKPKDRNDLITWMVENC +>sp|P41179|CCN1_TRYBB G2/mitotic-specific cyclin-1 OS=Trypanosoma brucei brucei GN=CYC1 PE=2 SV=1 +MMTNLNVRTIKGCFPGSLPDALNDMSVINHILSLTGRFDEGQAALASTVNVVYVGTAVYD +IPRYREEYTKNFIARGCGISEVCVAEARSTGATPCAAATMVTPDQLQHLAEAHIILLPEV +TPFSLYDAGRKRASMRVSGQVAARGVVLVGGGCCFRAEHSDSANPKTCAQYMLSRENEVD +SGRPVEMEEGGAKWEYLCVHGLSVLPGIFCPQHSSRDATGLLLNESFSKMLKRHPTERGI +GVDCRAVLLLMGDGRYQVLTIANREGRTASVKDINIQIKDVVEGNVQTTTIQQQGSVEEL +LRKPCGPVVRDPFEAYYAMANPTALTEKLLCAPR +>sp|Q06374|CG1_COLGL Putative G1/S-specific cyclin OS=Colletotrichum gloeosporioides GN=CYC1 PE=2 SV=1 +MLKRATVDVSSPADVPQPKRRKAYLTGADQARDEYAEDAIKHMEYIESETTPDKRCIPAS +VLEMRPEVIRSLIKIHKELNLLPQTLLLSVNLLDRFCCKKDFLGQVYYRVFSCAALWIAS +KFVEDKDRVPSVCRVLDYVPSGMYCNSALFFRLEICMLNVLAWRVNHPTPWCFIQLFCPG +EGNKTKVRELALDICTARLPENRWMAIRPSILARHCLLHACNNLFSDTSLVEMYGSMPRD +KPSTH +>sp|O93863|CYC_PACTA Cytochrome c OS=Pachysolen tannophilus GN=CYC1 PE=3 SV=3 +MPAPYEKGSAKKGATLFKTRCLQCHTTEAGGAHKVGPNLNGVFGRHSGQAEGYSYTDANK +QKGALWEAQTMSDYLENPKKYIPGTKMAFGGLKKAKDRNDLVTYLLSATK +>sp|Q6Q4H8|CYC_PICPA Cytochrome c OS=Komagataella pastoris GN=CYC1 PE=3 SV=3 +MPAPFEKGSEKKGATLFKTRCLQCHTVEAGGPHKVGPNLHGVFGRKSGLAEGYSYTDANK +RKGVEWSEQTMSDYLENPKKYIPGTKMAFGGLKKAKDRNDLITYLASATK + diff --git a/samples/distance.dat b/samples/distance.dat new file mode 100644 index 0000000..cb22647 --- /dev/null +++ b/samples/distance.dat @@ -0,0 +1,10 @@ +0 1 0.6667 +0 2 0.75 +0 3 0.7143 +0 4 0.778 +1 2 0.6667 +1 3 0.8333 +1 4 0.8333 +2 3 0.5714 +2 4 0.875 +3 4 0.4286 diff --git a/samples/hem_b.fasta b/samples/hem_b.fasta new file mode 100644 index 0000000..69cbcaf --- /dev/null +++ b/samples/hem_b.fasta @@ -0,0 +1,52 @@ +>Human 2144721 HBHU 4HHB +MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLG +AFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVAN +ALAHKYH +>Human-sickle 2392691 2HBS +VHLTPVEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGA +FSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANA +LAHKYH +>Gorilla 232230 +MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLG +AFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFKLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVAN +ALAHKYH +>Spider Monkey 122567 +VHLTGEEKAAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMSNPKVKAHGKKVLGA +FSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPQLQAAYQKVVAGVANA +LAHKYH +>Horse 230633 2MHB +VQLSGEEKAAVLALWDKVNEEEVGGEALGRLLVVYPWTQRFFDSFGDLSNPGAVMGNPKVKAHGKKVLHS +FGEGVHHLDNLKGTFAALSELHCDKLHVDPENFRLLGNVLVVVLARHFGKDFTPELQASYQKVVAGVANA +LAHKYH +>Deer 229975 1HDS +MLTAEEKAAVTGFWGKVDVDVVGAQALGRLLVVYPWTQRFFQHFGNLSSAGAVMNNPKVKAHGKRVLDAF +TQGLKHLDDLKGAFAQLSGLHCNKLHVNPQNFRLLGNVLALVVARNFGGQFTPNVQALFQKVVAGVANAL +AHKYH +>Pig 809283 2PGH +VHLSAEEKEAVLGLWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSNADAVMGNPKVKAHGKKVLQS +FSDGLKHLDNLKGTFAKLSELHCDQLHVDPENFRLLGNVIVVVLARRLGHDFNPDVQAAFQKVVAGVANA +LAHKYH +>Cow 576143 1HDA +MLTAEEKAAVTAFWGKVKVDEVGGEALGRLLVVYPWTQRFFESFGDLSTADAVMNNPKVKAHGKKVLDSF +SNGMKHLDDLKGTFAALSELHCDKLHVDPENFKLLGNVLVVVLARNFGKEFTPVLQADFQKVVAGVANAL +AHRYH +>Gull 122618 +VHWSAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAINGNPMVRAHGKKVLTS +FGEAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFAKDFTPDSQAAWQKLVRVVAHA +LARKYH +>Trout 1942655 1OUT +VEWTDAEKSTISAVWGKVNIDEIGPLALARVLIVYPWTQRYFGSFGNVSTPAAIMGNPKVAAHGKVVCGA +LDKAVKNMGNILATYKSLSETHANKLFVDPDNFRVLADVLTIVIAAKFGASFTPEIQATWQKFMKVVVAA +MGSRYF +>Rockcod 1065040 1HBH +VEWTDKERSIISDIFSHMDYDDIGPKALSRCLIVYPWTQRHFSGFGNLYNAEAIIGNANVAAHGIKVLHG +LDRGVKNMDNIAATYADLSTLHSEKLHVDPDNFKLLSDCITIVLAAKMGHAFTAETQGAFQKFLAVVVSA +LGKQYH +>Lamprey 230607 2LHB +PIVDTGSVAPLSAAEKTKIRSAWAPVYSTYETSGVDILVKFFTSTPAAQEFFPKFKGLTTADELKKSADV +RWHAERIINAVDDAVASMDDTEKMSMKLRNLSGKHAKSFQVDPEYFKVLAAVIADTVAAGDAGFEKLMSM +ICILLRSAY +>Sea-Cucumber 576151 1HLB +XGGTLAIQAQGDLTLAQKKIVRKTWHQLMRNKTSFVTDVFIRIFAYDPSAQNKFPQMAGMSASQLRSSRQ +MQAHAIRVSSIMSEYVEELDSDILPELLATLARTHDLNKVGADHYNLFAKVLMEALQAELGSDFNEKTRD +AWAKAFSVVQAVLLVKHG