diff --git a/Align2/Sources/Profile.cc b/Align2/Sources/Profile.cc index 670ec84..694f0ff 100755 --- a/Align2/Sources/Profile.cc +++ b/Align2/Sources/Profile.cc @@ -89,7 +89,7 @@ namespace Victor { namespace Align2{ time_t t; pResetData(); seqLen = ali.getTarget().size(); - numSeq = ali.size() - 2; + numSeq = ali.size(); time(&t); newtime = localtime(&t); cout << "ready for pConstructData " << newtime->tm_hour << "/" << newtime->tm_min << endl; diff --git a/Makefile b/Makefile index c0f7aa4..7336999 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ # Path to project directory. UPDIR = . # Path to subdirectories. -SUBDIRS = tools Energy/Sources Biopool/Sources Align2/Sources Energy/Sources/TorsionPotential Lobo/Sources Lobo/APPS Energy/APPS Biopool/APPS Align2/APPS +SUBDIRS = tools Energy/Sources Biopool/Sources Align2/Sources Phylo/Sources Energy/Sources/TorsionPotential Lobo/Sources Lobo/APPS Energy/APPS Biopool/APPS Align2/APPS Phylo/APPS # # Libraries and paths (which are not defined globally). diff --git a/Makefile.global b/Makefile.global index 1f78a89..982f852 100644 --- a/Makefile.global +++ b/Makefile.global @@ -106,11 +106,11 @@ SUBDIRS += _dummy_ LIB_PATH += -L$(UPDIR)/lib -INC_PATH += -I$(UPDIR)/tools -I$(UPDIR)/Energy/Sources -I$(UPDIR)/Biopool/Sources -I$(UPDIR)/Energy/Sources/TorsionPotential -I$(UPDIR)/Lobo/Sources -I$(UPDIR)/Align2/Sources -I$(UPDIR)/Biopool/APPS -I$(UPDIR)/Energy/APPS -I$(UPDIR)/Align2/APPS -I$(UPDIR)/Lobo/APPS +INC_PATH += -I$(UPDIR)/tools -I$(UPDIR)/Energy/Sources -I$(UPDIR)/Biopool/Sources -I$(UPDIR)/Energy/Sources/TorsionPotential -I$(UPDIR)/Lobo/Sources -I$(UPDIR)/Align2/Sources -I$(UPDIR)/Phylo/Sources -I$(UPDIR)/Biopool/APPS -I$(UPDIR)/Energy/APPS -I$(UPDIR)/Align2/APPS -I$(UPDIR)/Phylo/APPS -I$(UPDIR)/Lobo/APPS ifdef test - INC_PATH += -I$(UPDIR)/Biopool/Tests -I$(UPDIR)/Energy/Tests -I$(UPDIR)/Align2/Tests -I$(UPDIR)/Lobo/Tests - SUBDIRS = Biopool/Tests Energy/Tests Align2/Tests Lobo/Tests + INC_PATH += -I$(UPDIR)/Biopool/Tests -I$(UPDIR)/Energy/Tests -I$(UPDIR)/Align2/Tests -I$(UPDIR)/Phylo/Tests -I$(UPDIR)/Lobo/Tests + SUBDIRS = Biopool/Tests Energy/Tests Align2/Tests Phylo/Tests Lobo/Tests endif ####### Implicit rules diff --git a/Phylo/APPS/Makefile b/Phylo/APPS/Makefile new file mode 100755 index 0000000..02a0733 --- /dev/null +++ b/Phylo/APPS/Makefile @@ -0,0 +1,55 @@ +#--*- makefile -*-------------------------------------------------------------- +# +# Standard makefile +# +#------------------------------------------------------------------------------ + +# Path to project directory +UPDIR = ../.. +# Path to subdirectories +SUBDIR = +# Path to directory for binaries +BINPATH = ../../bin + +# +# Libraries and paths (which are not defined globally) +# + +LIBS = -lPhylo -lAlign2 -ltools + +LIB_PATH = -L. + +INC_PATH = -I. -I../../tools/ -I../../Align2/Sources -I../../Phylo/Sources + +# +# Objects and headers +# + +SOURCES = phylogen.cc + +OBJECTS = phylogen.o + +TARGETS = phylogen + + +EXECS = phylogen + + +LIBRARY = APPSlibPhylo.a + +# +# Install rule +# + +compile: all + +all: install +install: $(LIBRARY) $(TARGETS) + mv $(LIBRARY) $(UPDIR)/lib/ + mv $(EXECS) $(BINPATH)/ + +# +# Call global Makefile to do the job +# + +include ../../Makefile.global diff --git a/Phylo/APPS/phylogen.cc b/Phylo/APPS/phylogen.cc new file mode 100755 index 0000000..e9fbc8d --- /dev/null +++ b/Phylo/APPS/phylogen.cc @@ -0,0 +1,256 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . + */ + +// --*- C++ -*------x------------------------------------------------------------------------------ +// +// +// Description: This program calculates phylogenetic trees starting from FASTA file. +// Choose from 2 different methos Neighbor joining and UPGMA. +// This program calculates ClustalW allignment and use NJ for create guide trees. +// -----------------x------------------------------------------------------------------------------ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include + +using namespace Victor::Align2; +using namespace Victor::Phylo; +using namespace Victor; +using namespace std; + + + +/// Show command line options and help text. + +void +sShowHelp() { + cout << "\nP Phylogenetic Tree GENERATOR" //This program read FASTA file and calculate phylogenetic tree. Use NWALIGN for align. + << "\nThis program read FASTA file and calculate phylogenetic tree. Use NWALIGN for align." + << "\nOptions:" + << "\n" + << "\n * [--in ] \t Name of input FASTA file" + << "\n [--out ] \t Name of output file (default = to screen)" + << "\n [-m ] \t Name of substitution matrix file (default = blosum62.dat)" + << "\n" + << "\n [-d ] \t Min. master vs all seq. id. filter threshold (suggested = 0.00)" + << "\n [-D ] \t Min. all vs all seq. id. filter threshold (suggested = 0.00)" + << "\n [-u ] \t Max. master vs all seq. id. filter threshold (suggested = 1.00)" + << "\n [-U ] \t Max. all vs all seq. id. filter threshold (suggested = 1.00)" + << "\n" + << "\n [--ws <0|1|2|3>] \t Weighting scheme for profiles (default = 0, i.e. no weighting scheme)" + << "\n \t --ws=0: No weighting scheme (default)." + << "\n \t --ws=1: Calculate a frequency profile or PSSM using Henikoff weighting scheme." + << "\n \t --ws=2: Calculate a frequency profile or PSSM using PSIC weighting scheme." + << "\n \t --ws=3: Calculate a frequency profile or PSSM using SeqDivergence weighting scheme." + << "\n" + << "\n [-o ] \t Open gap penalty (default = 12.00)" + << "\n [-e ] \t Extension gap penalty (default = 3.00)" + << "\n" + << "\n [--function <0|1|2>] \t CLuster function (default = 0, i.e. UPGMA)" + << "\n \t --gf=0: UPGMA (default)." + << "\n \t --gf=1: NJ." + << "\n \t --gf=2: ClustalW use NJ." + << "\n [-t ] \t For decide dimension of token, very important for ClustalW(10%size of first seq)" + << "\n [--cSeq ] \t Coefficient for sequence alignment (default = 1.0)" + << "\n" + << "\n [--ktuples] \t use ktuples method formula for calculate distance of pairwise sequence" + << "\n [--verbose] \t Verbose mode" + << "\n" << endl; +} + + +/// Show command line options and help text. + +int main(int argc, char **argv) { + + string inputFileName, outputFileName, matrixFileName, matrixStrFileName; + double openGapPenalty, extensionGapPenalty; + unsigned int function; + double cSeq; + double downs, downa, ups, upa; + bool verbose=false; + bool ktuples=false; + unsigned int weightingScheme; + int tokenSize; + + struct tm* newtime; + + time_t t; + + // -------------------------------------------------- + // 0. Treat options + // -------------------------------------------------- + if (getArg("h", argc, argv)) { + sShowHelp(); + return 1; + } + + + getArg("-in", inputFileName, argc, argv, "cyc1_input.fasta"); + getArg("-out", outputFileName, argc, argv, "!"); + + getArg("m", matrixFileName, argc, argv, "blosum62.dat"); + + getArg("-function", function, argc, argv, 0); + + getArg("-ws", weightingScheme, argc, argv, 0); + + getArg("o", openGapPenalty, argc, argv, 12.00); + getArg("e", extensionGapPenalty, argc, argv, 3.00); + + getArg("d", downs, argc, argv, 999.9); + getArg("D", downa, argc, argv, 999.9); + getArg("u", ups, argc, argv, 999.9); + getArg("U", upa, argc, argv, 999.9); + + getArg("t", tokenSize, argc, argv, -1); + + getArg("-cSeq", cSeq, argc, argv, 0.80); + + ktuples = getArg("-ktuples", argc, argv); + verbose = getArg("-verbose", argc, argv); + + // -------------------------------------------------- + // 1. Load FASTA + // -------------------------------------------------- + Alignment aliSec; + string path = getenv("VICTOR_ROOT"); + if (path.length() < 3) + cout << "Warning: environment variable VICTOR_ROOT is not set." << endl; + string dataPath = path + "data/"; + string finalPath=dataPath+inputFileName; + + cout<<"path "<tm_hour << " hour " << newtime->tm_min<<" min "<tm_sec<<" sec "<< endl; + tree.upgma(aliSec,ktuples,verbose); + cout<<"##########################################Tree create with PhyloTreeUPGMA#############################"<tm_hour << " hour " << newtime->tm_min<<" min "<tm_sec<<" sec "<< endl; + out=tree.printNewickTree(); + } + else if(function==1){ + cout<<"##########################################Starting PhyloTreeNJ########################################"<tm_hour << " hour " << newtime->tm_min<<" min "<tm_sec<<" sec "<< endl; + tree.neighborJoining(aliSec,ktuples,verbose); + cout<<"##########################################Tree create with PhyloTreeNJ################################"<tm_hour << " hour " << newtime->tm_min<<" min "<tm_sec<<" sec "<< endl; + out=tree.printNewickTree(); + } + else if(function==2){ + cout<<"##########################################Starting PhyloTreeNJ########################################"<tm_hour << " hour " << newtime->tm_min<<" min "<tm_sec<<" sec "<< endl; + tree.neighborJoining(aliSec,ktuples,verbose); + cout<<"##########################################Guide Tree create with PhyloTreeNJ################################"<tm_hour << " hour " << newtime->tm_min<<" min "<tm_sec<<" sec "<< endl; + out=tree.printNewickTree(); + cout<<"##########################################Starting ClustalW###############################"<tm_hour << " hour " << newtime->tm_min<<" min "<tm_sec<<" sec "<< endl; + cout<<"number of leaf from main "<tm_hour << " hour " << newtime->tm_min<<" min "<tm_sec<<" sec "<< endl; + + } + else + { + cout<<"invalid cluster, select [--cluster <0|1>]"; + } + + // -------------------------------------------------- + // 2. Output Tree + // -------------------------------------------------- + if(outputFileName=="!"){ + cout<<"Print the Tree "<. + */ + +// --*- C++ -*------x----------------------------------------------------------- +// +// +// Description: ClustalW allignemt. +// http://www.ebi.ac.uk/Tools/msa/clustalw2/ +// +// -----------------x----------------------------------------------------------- + + +// Includes: +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +// Global constants, typedefs, etc. (to avoid): +using namespace Victor::Align2; +using namespace Victor; +using namespace std; + +namespace Victor { namespace Phylo{ + + // CONSTRUCTORS: + + ClustalW::ClustalW(){ + score=0; + tokenSize=0; + } + + ClustalW::ClustalW(NewickTree gT) { + + cout<<"constructor"< tmpV(2); + int tokenSize=PhyloSupport::tokenSize; + + + vector nodeTree(guideTree.getNumberOfLeaf()); + for(unsigned int i=0;iseq.size()/10; + cout<<"Use Default token Size "< seqV(1); + vector weigthV(1); + vector nameV(1); + vector nameFinal(guideTree.getNumberOfLeaf()); + cout<<"percent to the end:"<1){ + cout<<""<<((j)*100/(guideTree.getNumberOfLeaf())*100)/100<<"% "<allignSeq[0].size()allignSeq[0].size()){ + tmpV=PhyloSupport::AlingMultiSvsMultiS(nodeTree[0]->allignSeq,nodeTree[1]->allignSeq,nodeTree[0]->weigthV,nodeTree[1]->weigthV,false,tokenSize); + //insert name + nodeTree[0]->nameV=PhyloSupport::mergeStringVector(nodeTree[0]->nameV,nodeTree[1]->nameV); + //insert weigth + nodeTree[0]->weigthV=PhyloSupport::mergeDoubleVector(nodeTree[0]->weigthV,nodeTree[1]->weigthV); + } + else { + tmpV=PhyloSupport::AlingMultiSvsMultiS(nodeTree[1]->allignSeq,nodeTree[0]->allignSeq,nodeTree[1]->weigthV,nodeTree[0]->weigthV,false,tokenSize); + //insert name + nodeTree[0]->nameV=PhyloSupport::mergeStringVector(nodeTree[1]->nameV,nodeTree[0]->nameV); + //insert weigth + nodeTree[0]->weigthV=PhyloSupport::mergeDoubleVector(nodeTree[1]->weigthV,nodeTree[0]->weigthV); + } + nodeTree[1]->ClustalW=true; + nodeTree[0]->allignSeq=tmpV; + + } + else if(j==1){//populate nodeTree + seqV[0]=nodeTree[i]->seq; + weigthV[0]=nodeTree[i]->weigth; + nameV[0]=nodeTree[i]->name; + nodeTree[i]->nameV=nameV; + nodeTree[i]->allignSeq=seqV; + nodeTree[i]->weigthV=weigthV; + } + else if(j==2 && nodeTree[i]->parent->numberOfChildLeaf==2){//first lap of align only seq VS seq + tmpV=PhyloSupport::AlingSvsS(nodeTree[i]->seq,nodeTree[i]->seq); + weigthV[0]=nodeTree[i]->weigth; + nameV[0]=nodeTree[i]->name; + if(weigthV.size()==1) + weigthV.push_back(nodeTree[i+1]->weigth); + else + weigthV[1]=nodeTree[i+1]->weigth; + if(nameV.size()==1) + nameV.push_back(nodeTree[i+1]->name); + else + nameV[1]=nodeTree[i+1]->name; + + nodeTree.erase(nodeTree.begin()+i+1); + nodeTree[i]->parent->allignSeq=tmpV; + nodeTree[i]->parent->weigthV=weigthV; + nodeTree[i]->parent->nameV=nameV; + nodeTree[i]=nodeTree[i]->parent; + } + else if(j>2 && nodeTree[i]->parent->numberOfChildLeaf==(int)j && !nodeTree[i]->ClustalW && iisLeft){ + if(nodeTree[i]->allignSeq[0].size()parent->right->allignSeq[0].size()) { + tmpV=PhyloSupport::AlingMultiSvsMultiS(nodeTree[i]->allignSeq,nodeTree[i]->parent->right->allignSeq,nodeTree[i]->weigthV,nodeTree[i]->parent->right->weigthV,false,tokenSize); + //insert name + nodeTree[i]->parent->nameV=PhyloSupport::mergeStringVector(nodeTree[i]->nameV,nodeTree[i]->parent->right->nameV); + //insert weigth + nodeTree[i]->parent->weigthV=PhyloSupport::mergeDoubleVector(nodeTree[i]->weigthV,nodeTree[i]->parent->right->weigthV); + } + else { + tmpV=PhyloSupport::AlingMultiSvsMultiS(nodeTree[i]->parent->right->allignSeq,nodeTree[i]->allignSeq,nodeTree[i]->parent->right->weigthV,nodeTree[i]->weigthV,false,tokenSize); + //insert name + nodeTree[i]->parent->nameV=PhyloSupport::mergeStringVector(nodeTree[i]->parent->right->nameV,nodeTree[i]->nameV); + //insert weigth + nodeTree[i]->parent->weigthV=PhyloSupport::mergeDoubleVector(nodeTree[i]->parent->right->weigthV,nodeTree[i]->weigthV); + } + nodeTree[i]->parent->right->ClustalW=true; + } + else{//is right child + if(nodeTree[i]->allignSeq[0].size()parent->left->allignSeq[0].size()) { + tmpV=PhyloSupport::AlingMultiSvsMultiS(nodeTree[i]->allignSeq,nodeTree[i]->parent->left->allignSeq,nodeTree[i]->weigthV,nodeTree[i]->parent->right->weigthV,false,tokenSize); + //insert name + nodeTree[i]->parent->nameV=PhyloSupport::mergeStringVector(nodeTree[i]->nameV,nodeTree[i]->parent->left->nameV); + //insert weigth + nodeTree[i]->parent->weigthV=PhyloSupport::mergeDoubleVector(nodeTree[i]->weigthV,nodeTree[i]->parent->left->weigthV); + } + else { + tmpV=PhyloSupport::AlingMultiSvsMultiS(nodeTree[i]->parent->left->allignSeq,nodeTree[i]->allignSeq,nodeTree[i]->parent->right->weigthV,nodeTree[i]->weigthV,false,tokenSize); + //insert name + nodeTree[i]->parent->nameV=PhyloSupport::mergeStringVector(nodeTree[i]->parent->left->nameV,nodeTree[i]->nameV); + //insert weigth + nodeTree[i]->parent->weigthV=PhyloSupport::mergeDoubleVector(nodeTree[i]->parent->left->weigthV,nodeTree[i]->weigthV); + } + nodeTree[i]->parent->left->ClustalW=true; + } + nodeTree[i]=nodeTree[i]->parent; + nodeTree[i]->allignSeq=tmpV; + } + else if(nodeTree[i]->ClustalW && iallignSeq,nodeTree[0]->nameV); + + cout<allignSeq,nodeTree[0]->weigthV)); + cout< seq Vector of sequence. + *@return string txt Correct rappresetation in CLustalW format(50 char by lane) + */ + string ClustalW::printClustalWFromat(vector seq){ + vector names(seq.size()); + for(unsigned int i=0; i seq Vector of sequence. + *@return string txt Correct rappresetation in CLustalW format(50 char by lane) + */ + string ClustalW::printClustalWFromat(vector seq, vector names){ + string txt="CLUSTALW \n\n"; + string tmp=""; + unsigned int j=0; + unsigned int index=0; + unsigned int dim=50; + vector countSimb(seq.size(),0); + bool onlyGap=true; + while(j seq Vector of sequence. + *@return double score the score of similarity. + */ + double ClustalW::scoreClustalW(vector allignSeq){ + //matrix config + string path = getenv("VICTOR_ROOT"); + if (path.length() < 3) + cout << "Warning: environment variable VICTOR_ROOT is not set." << endl; + + string dataPath = path + "data/"; + + //Default matrix + string matrixFileName=PhyloSupport::matrix; + matrixFileName = dataPath + matrixFileName; + ifstream matrixFile(matrixFileName.c_str()); + if (!matrixFile) + ERROR("Error opening substitution matrix file.", exception); + SubMatrix sub(matrixFile); + //end matrix config + double score=0; + for(unsigned int x=0;x seq Vector of sequence. + *@param vector w vector of weigth from guide tree + *@return double score the score of similarity. + */ + double ClustalW::scoreClustalW(vector allignSeq, vector w){ + //matrix config + string path = getenv("VICTOR_ROOT"); + if (path.length() < 3) + cout << "Warning: environment variable VICTOR_ROOT is not set." << endl; + + string dataPath = path + "data/"; + + //Default matrix + string matrixFileName=PhyloSupport::matrix; + matrixFileName = dataPath + matrixFileName; + ifstream matrixFile(matrixFileName.c_str()); + if (!matrixFile) + ERROR("Error opening substitution matrix file.", exception); + SubMatrix sub(matrixFile); + //end matrix config + double score=0; + for(unsigned int x=0;x. + */ + + +#ifndef _ClustalW_h_ +#define _ClustalW_h_ + +// Includes: + +#include +#include +#include +#include + + +using std::string; +using std::vector; + + +// Global constants, typedefs, etc. (to avoid): +namespace Victor { namespace Phylo { + + /**@brief Methods to manages the global statistic data. + * + *@Description Use for ClustalW + * */ + class ClustalW { + + public: + + // CONSTRUCTORS + // void constructor. + ClustalW(); + // starting from guide Tree constructor. + ClustalW(NewickTree gT); + + //DESTRUCTOR: + ~ClustalW(); + + + // HELPERS: + //Generate and return string of sequenze in format .ClustalW + static string printClustalWFromat(vector seq); + //Generate and return string of sequenze in format .ClustalW with name + static string printClustalWFromat(vector seq, vector names); + //calculate score based on similairty of all sequence in CLustalW allign no weigth consider + double scoreClustalW(vector allignSeq); + //calculate score based on similairty of all sequence in CLustalW allign with weigth + double scoreClustalW(vector allignSeq, vector w); + + // OPERATORS: + void setTokenSize(unsigned int t); + void setScore(double s); + + // OPERATORS: + unsigned int getTokenSize(); + double getScore(); + + protected: + // OPERATORS: + //Apply ClustalW using Guide Tree + void progressiveAlign(); + // ATTRIBUTES: + unsigned int tokenSize; + double score; + + + + // ATTRIBUTES: + NewickTree guideTree; + + + + private: + // HELPERS: + + // ATTRIBUTES: + + + }; + + +}} +#endif + diff --git a/Phylo/Sources/Makefile b/Phylo/Sources/Makefile new file mode 100755 index 0000000..645c3f3 --- /dev/null +++ b/Phylo/Sources/Makefile @@ -0,0 +1,51 @@ +#--*- makefile -*-------------------------------------------------------------- +# +# Standard makefile +# +#------------------------------------------------------------------------------ + +# Path to project directory +UPDIR = ../.. +# Path to subdirectories +SUBDIR = +# Path to directory for binaries +BINPATH = ../../bin + +# +# Libraries and paths (which are not defined globally) +# + +LIBS = -lAlign2 -ltools + +LIB_PATH = -L. + +INC_PATH = -I. -I../../tools/ -I../../Align2/Sources + +# +# Objects and headers +# + +SOURCES = PhyloSupport.cc NewickTree.cc ClustalW.cc SeqNodeGraph.cc + +OBJECTS = PhyloSupport.o NewickTree.o ClustalW.o SeqNodeGraph.o + +TARGETS = + +EXECS = + +LIBRARY = libPhylo.a + +# +# Install rule +# + +compile: all + +install: $(LIBRARY) $(TARGETS) + mv $(LIBRARY) $(UPDIR)/lib + +# +# Call global Makefile to do the job +# +all: install +include ../../Makefile.global diff --git a/Phylo/Sources/NewickTree.cc b/Phylo/Sources/NewickTree.cc new file mode 100644 index 0000000..8b1bf45 --- /dev/null +++ b/Phylo/Sources/NewickTree.cc @@ -0,0 +1,784 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . + */ + + +// Includes: +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +// Global constants, typedefs, etc. (to avoid): +using namespace Victor::Align2; +using namespace Victor; +using namespace std; + +namespace Victor { namespace Phylo{ + + // CONSTRUCTORS: + + +std::vector distance; + +/** + *@Description for create void node + */ + NewickTree::NewickTree() { + //root->weigthV; + //root->nameV; + //root->allignSeq; + root=new iNode; + root->name="NULL"; + root->left=NULL; + root->right=NULL; + root->parent=NULL; + root->seq="noPURESeq"; + root->branchLength=-1; + root->divergenceR=0; + root->nseq=-1; + root->ClustalW=false; + root->isRigth=false; + root->isLeft=false; + + root->isLeaf=true; + root->numberOfChildLeaf=0; + } + + /** + *@Description for create node of tree + *@param int position numeber of sequenze in FASTA file + *@param string name name of sequence in FASTA file + *@param string pureSeq the Sequence with no modification like in FASTA file + *@param double divR divergence usefull inNJ method + */ + NewickTree::NewickTree(int position,string name,string pureSeq,double divR){ + //not need + //root->allignSeq; + //root->nameV; + //root->weigthV; + //void + root=new iNode; + root->left=NULL; + root->right=NULL; + root->parent=NULL; + root->branchLength=-1; + root->ClustalW=false; + root->isRigth=false; + root->isLeft=false; + //important + root->name=name; + root->divergenceR=divR; + root->seq=pureSeq; + root->nseq=position; + root->isLeaf=true; + root->numberOfChildLeaf=0; + } + + /** + *@Description for create node of tree + *@param int position numeber of sequenze in FASTA file + *@param string name name of sequence in FASTA file + *@param string pureSeq the Sequence with no modification like in FASTA file + */ + NewickTree::NewickTree(int position,string name,string pureSeq){ + //void + //root->weigthV; + //root->nameV; + //root->allignSeq; + root=new iNode; + root->left=NULL; + root->right=NULL; + root->parent=NULL; + root->branchLength=0; + root->ClustalW=false; + root->isRigth=false; + root->isLeft=false; + //important + root->name=name; + root->divergenceR=0; + root->seq=pureSeq; + root->nseq=position; + root->isLeaf=true; + root->numberOfChildLeaf=0; + } + + + + /** + *@Description Create new Tree. Used for union two different Tree. + *@param NewickTree *rTree the right child + *@param NewickTree *lTree the left child + */ + NewickTree::NewickTree( NewickTree *rTree, NewickTree *lTree) { + //not need + //root->weigthV; + //root->nameV; + //root->allignSeq; + //void + root=new iNode; + root->branchLength=-1; + root->ClustalW=false; + root->isRigth=false; + root->isLeft=false; + //important + root->isLeaf=false;//internal + int tmp=0; + if(rTree->isLeaf()) + tmp++; + if(lTree->isLeaf()) + tmp++; + root->numberOfChildLeaf=tmp+rTree->getNumOfChildren()+lTree->getNumOfChildren(); + root->name=""; + root->seq=""; + + rTree->getRoot()->isRigth=true; + root->right=rTree->getRoot(); + + lTree->getRoot()->isLeft=true; + root->left=lTree->getRoot(); + + root->parent=NULL; + + root->divergenceR=0; + } + + + /** + *@Description Basic destructor + */ + NewickTree::~NewickTree() { + } + + + // PREDICATES: + iNode *NewickTree::getRoot(){ + return root; + } + + iNode *NewickTree::getRightChild(){ + return root->right; + } + + iNode*NewickTree::getLeftChild(){ + return root->left; + } + iNode*NewickTree::getParent(){ + return root->parent; + } + + string NewickTree::getName(){ + return root->name; + } + + string NewickTree::getPureSeq(){ + return root->seq; + } + + + double NewickTree::getRdiv(){ + return root->divergenceR; + } + + + bool NewickTree::isLeaf(){ + return root->isLeaf; + } + + int NewickTree::getNumOfChildren(){ + return root->numberOfChildLeaf; + } + + double NewickTree::getWeigth(){ + return root->weigth; + } + + NewickTree NewickTree::getLeafInPosition(unsigned int index){ + return leafList[index]; + } + + unsigned int NewickTree::getNumberOfLeaf(){ + return leafList.size(); + } + + // OPERATORS: + /** + *@Description for create phylogenic Tree use NJ method + *@param Align2::Alignment ali alignament of starting sequence + *@param bool ktuples decide to use ktuples method for calculate identity of 2 sequence + *@param bool verbose for more information on video during computing. + */ + void NewickTree::neighborJoining(Align2::Alignment ali,bool ktuples, bool verbose){ + + int Nseq=ali.size()+1;//number of seq + vector > distance(Nseq, std::vector(Nseq, 0)); + + if(verbose) + cout<<"---------Creating Distance Matrix--------------/"<vet=PhyloSupport::calcAlignmentV(&ali,distance,ktuples,verbose); + + + + if (verbose){ + cout<<"---------Print Distance Matrix--------------/"< trees(Nseq); + + //Create all node + for(unsigned int i=0;i seqList(trees.size());//never modify, used after create guide tree + int count=0;//count number of seq insert in seqList + + + //memorize the nearest seq + unsigned int minj=0; + unsigned int mini=1; + //vector for new distance + vector newMediaD ( trees.size()); + + //for creat new matrix; + unsigned int newi=0; + unsigned int newj=0; + + + while(trees.size()!=3){// end 3 + if(verbose) + cout<<"----------------While num= "<<(Nseq-trees.size())<<"---------------"< > Mij(trees.size(), std::vector(trees.size(), 0)); + + for(unsigned int i=0; i Mij[i][j] && i!=j) + { + minj=j; + mini=i; + } + } + } + + if(verbose) + cout< > tmpdistance(trees.size()-1, std::vector(trees.size()-1, 0)); + + //reset + newi=0; + newj=0; + //copy old matrix in new matrix, but not mini (A) and minj (B) and not the new node (U) + for(unsigned int i=0; imini && imini && i>minj){ + newi=i-2; + } + if(jmini && jmini && j>minj){ + newj=j-2; + } + if(verbose) + cout<mini && imini && i>minj){ + newi=i-2; + } + tmpdistance[newi][tmpdistance.size()-1]=(distance[mini][i]+distance[minj][i]-distance[mini][minj])/2; + if(verbose) + cout<setBranchLength((distance[1][2]/2+distance[0][2]/2)/2-tmpT->getValueOFChildBranchLength()); + tmpT=new NewickTree(tmpT,&trees[2]); + + //parent + trees[2].setParent(*tmpT); + + } + else{//min distance 0,2 + trees[0].setBranchLength(distance[0][2]/2-trees[0].getValueOFChildBranchLength()); + trees[2].setBranchLength(distance[0][2]/2-trees[2].getValueOFChildBranchLength()); + tmpT=new NewickTree(&trees[0],&trees[2]); + //parent + trees[0].setParent(*tmpT); + trees[2].setParent(*tmpT); + + trees[1].setBranchLength((distance[1][2]/2+distance[1][0]/2)/2-trees[1].getValueOFChildBranchLength()); + tmpT->setBranchLength((distance[1][2]/2+distance[1][0]/2)/2-tmpT->getValueOFChildBranchLength()); + tmpT=new NewickTree(tmpT,&trees[1]); + //parent + trees[1].setParent(*tmpT); + + } + + } + else{//min distance 1,2 + trees[1].setBranchLength(distance[1][2]/2-trees[1].getValueOFChildBranchLength()); + trees[2].setBranchLength(distance[1][2]/2-trees[2].getValueOFChildBranchLength()); + tmpT=new NewickTree(&trees[1],&trees[2]); + //parent + trees[1].setParent(*tmpT); + trees[2].setParent(*tmpT); + + trees[0].setBranchLength((distance[0][2]/2+distance[0][1]/2)/2-trees[0].getValueOFChildBranchLength()); + tmpT->setBranchLength((distance[0][1]/2+distance[0][2]/2)/2-tmpT->getValueOFChildBranchLength()); + tmpT=new NewickTree(tmpT,&trees[0]); + //parent + trees[0].setParent(*tmpT); + + } + + if(trees[0].isLeaf()){ + seqList[count]=trees[0]; + trees[0].setPosition(count); + count++; + } + if(trees[1].isLeaf()){ + seqList[count]=trees[1]; + trees[1].setPosition(count); + count++; + } + if(trees[2].isLeaf()){ + seqList[count]=trees[2]; + trees[2].setPosition(count); + count++; + } + + //calculate weigth + //The weights are dependent upon the distance from the root of the tree + //but sequences which have a common branch with other sequences share the weight + //derived from the shared branch + if(verbose) + cout<getRoot(); + + + }//end method + + + // OPERATORS: + /** + *@Description for create phylogenic Tree use UPGMA method + *@param Align2::Alignment ali alignament of starting sequence + *@param bool ktuples decide to use ktuples method for calculate identity of 2 sequence + *@param bool verbose for more information on video during computing. + */ + void NewickTree::upgma(Align2::Alignment ali,bool ktuples, bool verbose){ + + int Nseq=ali.size()+1;//number of seq + vector > distance(Nseq, std::vector(Nseq, 0)); + + vectorvet=PhyloSupport::calcAlignmentV(&ali,distance,ktuples,verbose); + vector trees(Nseq); + + + if(verbose){ + cout<<"---------Print Distance Matrix--------------/"< newMediaD ( trees.size()); + + //for creat new matrix; + unsigned int newi=0; + unsigned int newj=0; + + + while(trees.size()!=1){ + + mini=0; + minj=1; + //search nearest seq + for(unsigned int i=0; i distance[i][j] && i!=j) + { + minj=j; + mini=i; + } + } + } + + if(verbose) + cout< > tmpdistance(trees.size()-1, std::vector(trees.size()-1, 0)); + newi=0; + newj=0; + + //copy old matrix in new matrix, but not mini (A) and minj (B) and not the new node (U) + for(unsigned int i=0; imini && imini && i>minj){ + newi=i-2; + } + if(jmini && jmini && j>minj){ + newj=j-2; + } + if(verbose) + cout<<"mini e minj "<mini && imini && i>minj){ + newi=i-2; + } + tmpdistance[newi][tmpdistance.size()-1]=(distance[mini][i]+distance[minj][i])/2; + if(verbose) + cout<branchLength; + if(seq->parent!=NULL){ + w+=( calcWeigth(seq->parent)/(seq->parent->numberOfChildLeaf) ); + } + return w; + } + + + /** + *@Description Return max values of this node children branchLength. + * Return zero if left child and right are NULL + */ + + double NewickTree::getValueOFChildBranchLength(){ + double value1=0; + double value2=0; + if(root->left!=NULL) + { + value1=root->left->branchLength; + } + if(root->right!=NULL) + { + value2=root->right->branchLength; + } + if(value1>value2) + return value1; + return value2; + } + + + /** + * Set the value of node if is -1 its escape during print + * @param val + * + */ + void NewickTree::setBranchLength(double val){ + if(val>0) + root->branchLength=val; + else + root->branchLength=0; + } + + /** + * Set the value of divergence will use in NJ if is -1 its escape during print + * @param val + * + */ + void NewickTree::setRdiv(double val){ + root->divergenceR=val; + } + + + void NewickTree::setParent(NewickTree pTree) { + root->parent=pTree.getRoot(); + } + + void NewickTree::setWeigth(double w){ + root->weigth=w; + } + + + void NewickTree::setPosition(unsigned int pos){ + root->nseq=pos; + } + + void NewickTree::setAlingSeq(vector vSeq){ + root->allignSeq=vSeq; + } + + +}} // namespace diff --git a/Phylo/Sources/NewickTree.h b/Phylo/Sources/NewickTree.h new file mode 100644 index 0000000..2e01377 --- /dev/null +++ b/Phylo/Sources/NewickTree.h @@ -0,0 +1,132 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . + */ + + +#ifndef _NewickTree_h_ +#define _NewickTree_h_ + +// Includes: + +#include +#include +#include +#include + + +using std::string; +using std::vector; + +//rappresent the node of a phylogenic tree +struct iNode +{ + string name; + int nseq; + string seq;//pure seq + vector allignSeq; + bool isRigth; + bool isLeft; + iNode *left; + iNode *right; + iNode *parent; + int numberOfChildLeaf; + bool isLeaf; + double branchLength; + double divergenceR;//for NJ + double weigth;//for ClustalW + vector weigthV;//for ClustalW + vector nameV;//for ClustalW + bool ClustalW;//for ClustalW + +}; + +// Global constants, typedefs, etc. (to avoid): +namespace Victor { namespace Phylo { + + /**@brief Methods to manages the global statistic data. + * + *@Description Use for create Tree during phylogeny + * */ + class NewickTree { + public: + + // CONSTRUCTORS/DESTRUCTOR: + NewickTree(); + NewickTree(int position,string name,string pureSeq,double divR); + NewickTree(NewickTree *rTree, NewickTree *lTree); + NewickTree(int position,string name,string pureSeq); + + ~NewickTree(); + + + // PREDICATES: + iNode *getRoot(); + iNode *getRightChild(); + iNode*getLeftChild(); + iNode*getParent(); + + string getName(); + string getPureSeq(); + + double getValueOFChildBranchLength(); + double getRdiv(); + double getWeigth(); + + bool isLeaf(); + + int getNumOfChildren(); + + unsigned int getNumberOfLeaf(); + + NewickTree getLeafInPosition(unsigned int index); + + + + + // OPERATORS: + void setBranchLength(double val); + void neighborJoining(Align2::Alignment ali,bool ktuples=false, bool verbose=false); + void upgma(Align2::Alignment ali, bool ktuples=false, bool verbose=false); + void setRdiv(double val); + void setParent(NewickTree pTree); + void setWeigth(double w); + void setPosition(unsigned int pos); + void setAlingSeq(vector vSeq); + + + // Print in a string the tree using Newick format + string printNewickTree(); + + protected: + // HELPERS: + string printNewickTreeNode(iNode node); + double calcWeigth(iNode* seq); + + // ATTRIBUTES: + iNode *root; + vector leafList; + + + private: + // HELPERS: + void destroy_tree(iNode *leaf); + + + }; + + +}} +#endif + diff --git a/Phylo/Sources/PhyloSupport.cc b/Phylo/Sources/PhyloSupport.cc new file mode 100644 index 0000000..9f17d81 --- /dev/null +++ b/Phylo/Sources/PhyloSupport.cc @@ -0,0 +1,551 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . + */ + + +// Includes: +#include +#include +#include +#include +#include "PhyloSupport.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Global constants, typedefs, etc. (to avoid): +using namespace Victor::Align2; +using namespace Victor; +using std::string; +using std::vector; +using std::ifstream; +using std::cout; +using std::endl; + + +namespace Victor { namespace Phylo{ + + + /** @brief + * + **/ + // CONSTRUCTORS/DESTRUCTOR: + + /** + *@Description Basic contructor + */ + PhyloSupport::PhyloSupport(){ + + } + + + /** + *@Description Basic destructor + */ + PhyloSupport::~PhyloSupport() { + + } + + // ATTRIBUTES: + double PhyloSupport::openGapPenalty=12; + double PhyloSupport::extensionGapPenalty=3; + double PhyloSupport::cSeq=0.8; + double PhyloSupport::downs=999.9; + double PhyloSupport::downa=999.9; + double PhyloSupport::ups=999.9; + double PhyloSupport::upa=999.9; + int PhyloSupport::tokenSize=-1; + unsigned int PhyloSupport::weightingScheme=0; + string PhyloSupport::matrix="blosum62.dat"; + bool PhyloSupport::verbose=false; + + + vector PhyloSupport::calcAlignmentV(Alignment *aliSec, vector > &distance , bool ktuples,bool verbose){ + string seq1Name, seq2Name, seq1, seq2, sec1, sec2; + + Alignment newAli; + string path = getenv("VICTOR_ROOT"); + if (path.length() < 3) + cout << "Warning: environment variable VICTOR_ROOT is not set." << endl; + + string dataPath = path + "data/"; + + //n=4? no idea. Default in APP/subali.cc + int n=4; + + AlignmentData *ad; + + //Default matrix + string matrixFileName=PhyloSupport::matrix; + matrixFileName = dataPath + matrixFileName; + ifstream matrixFile(matrixFileName.c_str()); + if (!matrixFile) + ERROR("Error opening substitution matrix file.", exception); + SubMatrix sub(matrixFile); + + //Default gap function + + GapFunction *gf = new AGPFunction(openGapPenalty, extensionGapPenalty); + + + //Default Structure + Structure *str; + str = 0; + + //Default ScoringScheme + ScoringScheme *ss; + + //Global Align + Align *a; + if(verbose) + cout << "Start Suboptimal Needleman-Wunsch alignments:" << endl; + + double suboptPenaltyMul=1; + double suboptPenaltyAdd=1; + + unsigned int suboptNum=1; + + vector a2; + + Alignment a3; + + vector alignV(aliSec->size()+1);//vector of align + + double score=0; + //size not count target. so +1 + for(unsigned int index=0;indexsize()+1;index++){ + if(verbose) + cout<<" index="<size()){ + seq1 = Alignment::getPureSequence(aliSec->getTarget()); + seq1Name = aliSec->getTargetName(); + } + else{ + seq1 = Alignment::getPureSequence(aliSec->getTemplate(index)); + seq1Name = aliSec->getTemplateName(index); + } + for(unsigned int j=0;jsize()+1;j++){ + if(verbose) + cout<<" j="<size()){ + seq2 = Alignment::getPureSequence(aliSec->getTarget()); + seq2Name = aliSec->getTargetName(); + }else{ + seq2 = Alignment::getPureSequence(aliSec->getTemplate(j)); + seq2Name = aliSec->getTemplateName(j); + } + + ad = new SequenceData(n, seq1, seq2, seq1Name, seq2Name); + + ss = new ScoringS2S(&sub, ad, str, cSeq); + + a = new NWAlign(ad, gf, ss); + + a->setPenalties(suboptPenaltyMul, suboptPenaltyAdd); + a2 = a->generateMultiMatch(suboptNum); + if (a2.size() == 0) + ERROR("No output alignments generated.", exception); + + a2[0].cutTemplate(1); + if(ktuples==true) + { + score=PhyloSupport::distanceCalcTwoSeqktuples(a2[0].getTarget(),a2[0].getTemplate(0)); + } + else + score=PhyloSupport::distanceCalcTwoSeq(a2[0].getTarget(),a2[0].getTemplate(0)); + distance[index][j]=score; + if(verbose) + cout<<"SCORE="< PhyloSupport::AlingSvsS(string seq1,string seq2,bool verbose){ + + string path = getenv("VICTOR_ROOT"); + if (path.length() < 3) + cout << "Warning: environment variable VICTOR_ROOT is not set." << endl; + + string dataPath = path + "data/"; + + //n=4? no idea. Default in APP/subali.cc + int n=4; + + AlignmentData *ad; + + //Default matrix + string matrixFileName=PhyloSupport::matrix; + matrixFileName = dataPath + matrixFileName; + ifstream matrixFile(matrixFileName.c_str()); + if (!matrixFile) + ERROR("Error opening substitution matrix file.", exception); + SubMatrix sub(matrixFile); + + //Default gap function + + GapFunction *gf = new AGPFunction(openGapPenalty, extensionGapPenalty); + + + //Default Structure + Structure *str; + str = 0; + + //Default ScoringScheme + ScoringScheme *ss; + + //Global Align + Align *a; + if(verbose) + cout << "Start Suboptimal Needleman-Wunsch alignments:" << endl; + + double suboptPenaltyMul=1; + double suboptPenaltyAdd=1; + + unsigned int suboptNum=1; + + vector a2; + + + ad = new SequenceData(n, seq1, seq2, "seq1", "seq2"); + + ss = new ScoringS2S(&sub, ad, str, cSeq); + + a = new NWAlign(ad, gf, ss); + + a->setPenalties(suboptPenaltyMul, suboptPenaltyAdd); + a2 = a->generateMultiMatch(suboptNum); + if (a2.size() == 0) + ERROR("No output alignments generated.", exception); + vector result(2); + + result[0]=a2[0].getTarget(); + result[1]=a2[0].getTemplate(); + if(verbose){ + cout<<"result "< PhyloSupport::AlingMultiSvsMultiS(vector seq1,vector seq2,vector vWeigth1,vector vWeigth2,bool verbose,int tokenSize){ + + string gap="-"; + string tokenSizeGap=""; + string tokenSizeGap2=""; + int tokenSize2=tokenSize; + + //insert gap at end for string with same size//not need + while(seq1[0].size()>seq2[0].size()){ + for(unsigned int i=0; i tokenS1(seq1[0].size()/tokenSize); + vector tokenS2(seq2[0].size()/tokenSize2); + + for(unsigned int i=0; i tempSV(seq1.size()); + for(unsigned int j=0; j tempSV(seq2.size()); + for(unsigned int j=0; j edgeFor1(tokenS1.size()+tokenS2.size(),-1); + vector finalS(seq1.size()+seq2.size(),""); + unsigned int count1=0; + unsigned int count2=0; + while(count1returnBestEdge(); + count2=edgeFor1[count1]; + count1++; + } + else { + edgeFor1[count1]=tokenS1[count1]->returnBestEdgeAfterIndex(count2); + count2=edgeFor1[count1]; + count1++; + } + + } + + + //insert in final vector of string + for(unsigned int i=0; igetTokenSeq(x); + }//if + else{ + for(int j=0; jgetTokenSeq(x); + } + }//end for x + }//end if + else{ + for(unsigned x=0;xgetTokenSeq(x); + } + } + }//ed for i + + for(unsigned int i=0; ifinalS[seq1.size()].size()){ + for(unsigned int i=0; i &PhyloSupport::split(const std::string &s, char delim, std::vector &elems) { + std::stringstream ss(s); + std::string item; + while (std::getline(ss, item, delim)) { + elems.push_back(item); + } + return elems; + } + + std::vector PhyloSupport::split(const std::string &s, char delim) { + std::vector elems; + split(s, delim, elems); + return elems; + } + + + double PhyloSupport::calcDivR(vector vDist){ + double tmpR=0; + for(unsigned int i=0;i > &distance){ + string sMatrix=""; + //std::setprecision(3)//for minus precision + for(unsigned int i=0; i PhyloSupport::mergeStringVector( vector v1,vector v2 ){ + vector v(v1.size()+v2.size()); + for(unsigned int i=0; i PhyloSupport::mergeDoubleVector( vector v1,vector v2 ){ + vector v(v1.size()+v2.size()); + for(unsigned int i=0; i. + */ + + +#ifndef _PhyloSupport_h_ +#define _PhyloSupport_h_ + +// Includes: + +#include +#include +#include +#include + +using namespace Victor::Align2; +using std::string; +using std::vector; + + +// Global constants, typedefs, etc. (to avoid): +namespace Victor { namespace Phylo { + + /**@brief Methods to manages the global statistic data. + * + *@Description Use for create Tree during phylogeny and controll the parameter in NJ, UPGMA and CLUSTALW + * */ + class PhyloSupport { + public: + + // CONSTRUCTORS/DESTRUCTOR: + PhyloSupport(); + + + ~PhyloSupport(); + + + //STATIC: + + //Calc multi Align + static vector calcAlignmentV(Alignment *aliSec, vector > &distance, bool ktuples=false, bool verbose=false); + //Calc distance from 2 seq of DNA or protein + static double distanceCalcTwoSeq(string seq1,string seq2); + //Calc distance from 2 seq of DNA or protein use k-tuples method + static double distanceCalcTwoSeqktuples(string seq1,string seq2); + //for split string + static std::vector &split(const std::string &s, char delim, std::vector &elems); + static std::vector split(const std::string &s, char delim); + //for calculate divergent usefull for NJ + static double calcDivR(vector vDist); + //for print matrix of double + static void printMatrix(vector > &distance); + //aling 2 sequence + static vector AlingSvsS(string seq1,string seq2,bool verbose=false); + //aling 2 or more sequence use method of graph + static vector AlingMultiSvsMultiS(vector seq1,vector seq2,vector vWeigth1,vector vWeigth2,bool verbose=false,int tokenSize=1); + //insert a gap in certain position in a string + static string insertGapPosition(string seq, int position); + //cast a int to string + static string intToString( int num ); + //to merge 2 vector of string, order of merge is v=v1[0],v1[1]....v2[0]....v2[n-1] + static vector mergeStringVector(vector v1,vector v2 ); + //to merge 2 vector of double, order of merge is v=v1[0],v1[1]....v2[0]....v2[n-1] + static vector mergeDoubleVector(vector v1,vector v2 ); + + //static parameter for NJ UPGMA and ClustalW: + + //gap for align + static double openGapPenalty; + static double extensionGapPenalty; + //Cseq for align + static double cSeq; + //for align + static double downs, downa, ups, upa; + static unsigned int weightingScheme; + //ClustalW + static int tokenSize; + //Substitution matrix + static string matrix; + //verbose output + static bool verbose; + + + + protected: + + + private: + + + }; + + +}} +#endif + diff --git a/Phylo/Sources/SeqNodeGraph.cc b/Phylo/Sources/SeqNodeGraph.cc new file mode 100644 index 0000000..3a73cae --- /dev/null +++ b/Phylo/Sources/SeqNodeGraph.cc @@ -0,0 +1,292 @@ +/* This file is part of Victor. + + Victor is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Victor is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Victor. If not, see . + */ + + +// Includes: +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +// Global constants, typedefs, etc. (to avoid): +using namespace Victor::Align2; +using namespace Victor; +using namespace std; + +namespace Victor { namespace Phylo{ + + // CONSTRUCTORS: + + /** + *@Description Basic constructor, void node + * + */ + SeqNodeGraph::SeqNodeGraph(){ + indexStart=0; + indexFinish=0; + totNumSeq=0; + vector taxV(0); + taxEdge=taxV; + tokenSize=0; + numSeq=0; + averageTax=0; + numTokenInSeq2=0; + } + + + + //totNumSeq how much of seq in this node minimun 1 + //numSeqInS2 how much seq in the seq2 important for size of vector TaxEge; + //Weigth depends on guide tree; + + /** + *@Description Basic constructor + *@param totNumS how much of seq in this node minimun 1 + *@param numSeqInS2 how much seq in the seq2 important for size of vector TaxEge; + *@param Weigth depends on guide tree + *@param indexS start index + *@param indexF final index + * + */ + + SeqNodeGraph::SeqNodeGraph(unsigned int indexS,unsigned int indexF, unsigned int totNumS,vector tokenS,unsigned int numSeqInS2){ + indexStart=indexS; + indexFinish=indexF; + totNumSeq=totNumS; + vector taxV(numSeqInS2,0); + taxEdge=taxV; + tokenSeq=tokenS; + tokenSize=tokenSeq[0].size(); + numSeq=tokenSeq.size(); + averageTax=0; + numTokenInSeq2=numSeqInS2; + } + + + /** + *@Description Basic destructor + */ + SeqNodeGraph::~SeqNodeGraph() { + } + + // PREDICATES: + /** + *@Description find and return the best edge of node + * + */ + unsigned int SeqNodeGraph::returnBestEdge(){ + unsigned int best=0; + for(unsigned int i=0; i=taxEdge.size()) + best=-1; + return best; + } + + string SeqNodeGraph::getTokenSeq(unsigned int index){ + return tokenSeq[index]; + } + + int SeqNodeGraph::getIndexStart(){ + return indexStart; + } + int SeqNodeGraph::getIndexFinish(){ + return indexFinish; + } + unsigned int SeqNodeGraph::getTotNumSeq(){ + return totNumSeq; + } + double SeqNodeGraph::getTaxEdgeInPosition(unsigned int i){ + return taxEdge[i]; + } + char SeqNodeGraph::getCharOfTokenSeq(unsigned int seqNum, unsigned int pos){ + return tokenSeq[seqNum][pos]; + } + unsigned int SeqNodeGraph::getTokenSize(){ + return tokenSize; + } + double SeqNodeGraph::getAverageTax(){ + return averageTax; + } + + + + + // OPERATORS: + void SeqNodeGraph::setIndexStart(unsigned int i){ + indexStart=i; + } + void SeqNodeGraph::setIndexFinish(unsigned int i){ + indexFinish=i; + } + void SeqNodeGraph::setTotNumSeq(int i){ + numSeq=i; + } + void SeqNodeGraph::setTaxEdgeInPosition(unsigned int i, double tax){ + taxEdge[i]=tax+taxEdge[i]; + } + void SeqNodeGraph::setTokenSize(int size){ + tokenSize=size; + } + + /** + *@Description for print all edge-taxes of one node + * + */ + void SeqNodeGraph::printTaxEdge(){ + cout<<"tax vector"< vNode array of nodes + * + */ + void SeqNodeGraph::setNode(SeqNodeGraph* node, vector vNode){ + //matrix config + string path = getenv("VICTOR_ROOT"); + if (path.length() < 3) + cout << "Warning: environment variable VICTOR_ROOT is not set." << endl; + + string dataPath = path + "data/"; + + //Default matrix + string matrixFileName="blosum62.dat"; + matrixFileName = dataPath + matrixFileName; + ifstream matrixFile(matrixFileName.c_str()); + if (!matrixFile) + ERROR("Error opening substitution matrix file.", exception); + SubMatrix sub(matrixFile); + //end matrix config + + for(unsigned int i=0; igetTotNumSeq() ;j++) + {//how seq in node[i] of vNode + for(unsigned int x=0; xgetTokenSize();x++) + {//for all char in one string in node[i] of vNode + for(unsigned int y=0; ygetTotNumSeq();y++) + {//for all token of string in node + for(unsigned int w=0; wgetTokenSize();w++) + {//for all char in each seq of node + node->setTaxEdgeInPosition(i,sub.score[node->getCharOfTokenSeq(y,w)][vNode[i]->getCharOfTokenSeq(j,x)]); + + }//end for + + }//end for + }//end for + }//end for + }//end final for + + node->calculateAverageTax(); + } + + + /** + *@Description for a node calculate all value of edges for all other node consider weigth by guide tree of sequence + *@param SeqNodeGraph* node the node of graph That will be set + *@param vector vNode array of nodes + *@param vector vWeigth1 array of Weigth of first array of sequence + *@param vector vWeigth2 array of Weigth of second array of sequence + */ + void SeqNodeGraph::setNodeWithWeigth(SeqNodeGraph* node, vector vNode,vector vWeigth1,vector vWeigth2){ + //matrix config + string path = getenv("VICTOR_ROOT"); + if (path.length() < 3) + cout << "Warning: environment variable VICTOR_ROOT is not set." << endl; + + string dataPath = path + "data/"; + + //Default matrix + string matrixFileName="blosum62.dat"; + matrixFileName = dataPath + matrixFileName; + ifstream matrixFile(matrixFileName.c_str()); + if (!matrixFile) + ERROR("Error opening substitution matrix file.", exception); + SubMatrix sub(matrixFile); + //end matrix config + + for(unsigned int i=0; igetTotNumSeq() ;j++) + {//how seq in node[i] of vNode + for(unsigned int x=0; xgetTokenSize();x++) + {//for all char in one string in node[i] of vNode + for(unsigned int y=0; ygetTotNumSeq();y++) + {//for all token of string in node + for(unsigned int w=0; wgetTokenSize();w++) + {//for all char in each seq of node + node->setTaxEdgeInPosition(i,sub.score[node->getCharOfTokenSeq(y,w)][vNode[i]->getCharOfTokenSeq(j,x)]*vWeigth1[y]*vWeigth2[j]); + //cout<getCharOfTokenSeq(y,w)][vNode[i]->getCharOfTokenSeq(j,x)]*vWeigth1[y]*vWeigth2[j]<addTest(new CppUnit::TestCaller("Test1 - Calculate distance of 2 seq.", + &TestPhylo::testPhylo_A)); + suiteOfTests->addTest(new CppUnit::TestCaller("Test2 - Insert Gap in a seq.", + &TestPhylo::testPhylo_B)); + suiteOfTests->addTest(new CppUnit::TestCaller("Test3 - Calculate Multi Aling with weigth.", + &TestPhylo::testPhylo_C)); + suiteOfTests->addTest(new CppUnit::TestCaller("Test4 - Calculate Multi Aling with weigth and deletetion/insertion.", + &TestPhylo::testPhylo_D)); + + return suiteOfTests; + } + + /// Setup method + + void setUp() { + } + + /// Teardown method + + void tearDown() { + } + +protected: + + void testPhylo_A() { + string seq1="AAABBCCC"; + string seq2="AAABBCCC"; + double distance=Phylo::PhyloSupport::distanceCalcTwoSeq(seq1,seq2); + CPPUNIT_ASSERT( distance == 0 ); + seq2="DDDDDDDD"; + distance=Phylo::PhyloSupport::distanceCalcTwoSeq(seq1,seq2); + CPPUNIT_ASSERT( distance == 1 ); + } + + void testPhylo_B() { + + string seq="AAABBCCC"; + int pos=3; + seq=Phylo::PhyloSupport::insertGapPosition(seq,pos); + CPPUNIT_ASSERT(seq.substr(3,1) == "-"); + } + + void testPhylo_C() { + string a="AAAAAAAA"; + string b="AAAAAAAA"; + string c="AAAAAAAA"; + string o="AAAAAAAA"; + string t="AAAAAAAA"; + vector num(2); + vector letter(3); + num[0]=o; + num[1]=t; + letter[0]=a; + letter[1]=b; + letter[2]=c; + + vector w1(2); + vector w2(3); + w1[0]=1; + w1[1]=1; + w2[2]=1; + w2[0]=1; + w2[1]=1; + vector result=Phylo::PhyloSupport::AlingMultiSvsMultiS(num,letter,w1,w2,false,1); + for(unsigned int i=0;i num(2); + vector letter(3); + num[0]=o; + num[1]=t; + letter[0]=a; + letter[1]=b; + letter[2]=c; + + vector w1(3); + vector w2(2); + w1[0]=1; + w1[1]=1; + w1[2]=1; + w2[0]=1; + w2[1]=1; + vector result=Phylo::PhyloSupport::AlingMultiSvsMultiS(num,letter,w1,w2,false,2); + CPPUNIT_ASSERT(result[0][0] == '-'); + CPPUNIT_ASSERT(result[0][2] == 'C'); + + } + +}; diff --git a/Phylo/Tests/TestPhylogen.cc b/Phylo/Tests/TestPhylogen.cc new file mode 100644 index 0000000..2e7f493 --- /dev/null +++ b/Phylo/Tests/TestPhylogen.cc @@ -0,0 +1,80 @@ +/* + * TestPhylogen.cc + * + * Created on: 02 07 2015 + * Author: Matteo Del Pioluogo + */ + +#include +#include +#include + +#include + +using namespace std; + +int main(){ + CppUnit::TextUi::TestRunner runner; + cout << "Creating Test Suites: OK" << endl; + runner.addTest(TestPhylo::suite()); + cout<< "Running the unit tests."<sp|P00125|CY1_BOVIN Cytochrome c1, heme protein, mitochondrial OS=Bos taurus GN=CYC1 PE=1 SV=2 +MAAAAATLRGAMVGPRGAGLPGARARGLLCGARPGQLPLRTPQAVSLSSKSGLSRGRKVI +LSALGMLAAGGAGLAVALHSAVSASDLELHPPSYPWSHRGLLSSLDHTSIRRGFQVYKQV +CSSCHSMDYVAYRHLVGVCYTEDEAKALAEEVEVQDGPNEDGEMFMRPGKLSDYFPKPYP +NPEAARAANNGALPPDLSYIVRARHGGEDYVFSLLTGYCEPPTGVSLREGLYFNPYFPGQ +AIGMAPPIYNEVLEFDDGTPATMSQVAKDVCTFLRWAAEPEHDHRKRMGLKMLLMMGLLL +PLVYAMKRHKWSVLKSRKLAYRPPK +>sp|P08574|CY1_HUMAN Cytochrome c1, heme protein, mitochondrial OS=Homo sapiens GN=CYC1 PE=1 SV=3 +MAAAAASLRGVVLGPRGAGLPGARARGLLCSARPGQLPLRTPQAVALSSKSGLSRGRKVM +LSALGMLAAGGAGLAMALHSAVSASDLELHPPSYPWSHRGLLSSLDHTSIRRGFQVYKQV +CASCHSMDFVAYRHLVGVCYTEDEAKELAAEVEVQDGPNEDGEMFMRPGKLFDYFPKPYP +NSEAARAANNGALPPDLSYIVRARHGGEDYVFSLLTGYCEPPTGVSLREGLYFNPYFPGQ +AIAMAPPIYTDVLEFDDGTPATMSQIAKDVCTFLRWASEPEHDHRKRMGLKMLMMMALLV +PLVYTIKRHKWSVLKSRKLAYRPPK +>sp|P00044|CYC1_YEAST Cytochrome c iso-1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=CYC1 PE=1 SV=2 +MTEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHSGQAEGYSYTDANIK +KNVLWDENNMSEYLTNPKKYIPGTKMAFGGLKKEKDRNDLITYLKKACE +>sp|P30183|CCB11_ARATH Cyclin-B1-1 OS=Arabidopsis thaliana GN=CYCB1-1 PE=1 SV=2 +MMTSRSIVPQQSTDDVVVVDGKNVAKGRNRQVLGDIGNVVRGNYPKNNEPEKINHRPRTR +SQNPTLLVEDNLKKPVVKRNAVPKPKKVAGKPKVVDVIEISSDSDEELGLVAAREKKATK +KKATTYTSVLTARSKAACGLEKKQKEKIVDIDSADVENDLAAVEYVEDIYSFYKSVESEW +RPRDYMASQPDINEKMRLILVEWLIDVHVRFELNPETFYLTVNILDRFLSVKPVPRKELQ +LVGLSALLMSAKYEEIWPPQVEDLVDIADHAYSHKQILVMEKTILSTLEWYLTVPTHYVF +LARFIKASIADEKMENMVHYLAELGVMHYDTMIMFSPSMVAASAIYAARSSLRQVPIWTS +TLKHHTGYSETQLMDCAKLLAYQQWKQQEEGSESSTKGALRKKYSKDERFAVALIPPAKA +LLTGTESA +>sp|Q0JF02|CPS4_ORYSJ Syn-copalyl diphosphate synthase OS=Oryza sativa subsp. japonica GN=CPS4 PE=2 SV=1 +MPVFTASFQCVTLFGQPASAADAQPLLQGQRPFLHLHARRRRPCGPMLISKSPPYPASEE +TREWEADGQHEHTDELRETTTTMIDGIRTALRSIGEGEISISAYDTSLVALLKRLDGGDG +PQFPSTIDWIVQNQLPDGSWGDASFFMMGDRIMSTLACVVALKSWNIHTDKCERGLLFIQ +ENMWRLAHEEEDWMLVGFEIALPSLLDMAKDLDLDIPYDEPALKAIYAERERKLAKIPRD +VLHSMPTTLLHSLEGMVDLDWEKLLKLRCLDGSFHCSPASTATAFQQTGDQKCFEYLDGI +VKKFNGGVPCIYPLDVYERLWAVDRLTRLGISRHFTSEIEDCLDYIFRNWTPDGLAHTKN +CPVKDIDDTAMGFRLLRLYGYQVDPCVLKKFEKDGKFFCLHGESNPSSVTPMYNTYRASQ +LKFPGDDGVLGRAEVFCRSFLQDRRGSNRMKDKWAIAKDIPGEVEYAMDYPWKASLPRIE +TRLYLDQYGGSGDVWIGKVLHRMTLFCNDLYLKAAKADFSNFQKECRVELNGLRRWYLRS +NLEKFGGTDPQTTLMTSYFLASANIFEANRAAERLGWARVALLADAVSSHFRRIGGPKNS +TSNLEELISLVPFDDAYSGSLREAWKQWLMAWTAKESSQESIEGDTAILLVRAIEIFGGR +HVLTGQRPDLWEYSQLEQLTSSICCKLSRRVLAQENGESTEKVEEIDQQVDLEMQELTRR +VLQGCSAINRLTRETFLHVVKSFCYVAYCSPETIDSHIDKVIFQDVI +>sp|P00048|CYC_NEUCR Cytochrome c OS=Neurospora crassa (strain ATCC 24698 / 74-OR23-1A / CBS 708.71 / DSM 1257 / FGSC 987) GN=cyc-1 PE=1 SV=2 +MGFSAGDSKKGANLFKTRCAQCHTLEEGGGNKIGPALHGLFGRKTGSVDGYAYTDANKQK +GITWDENTLFEYLENPKKYIPGTKMAFGGLKKDKDRNDIITFMKEATA +>sp|Q9D0M3|CY1_MOUSE Cytochrome c1, heme protein, mitochondrial OS=Mus musculus GN=Cyc1 PE=1 SV=1 +MAAAAASLRRTVLGPRGVGLPGASAPGLLGGARSRQLPLRTPQAVSLSSKSGPSRGRKVM +LSALGMLAAGGAGLAVALHSAVSASDLELHPPSYPWSHRGLLSSLDHTSIRRGFQVYKQV +CSSCHSMDYVAYRHLVGVCYTEEEAKALAEEVEVQDGPNDDGEMFMRPGKLSDYFPKPYP +NPEAARAANNGALPPDLSYIVRARHGGEDYVFSLLTGYCEPPTGVSLREGLYFNPYFPGQ +AIGMAPPIYTEVLEYDDGTPATMSQVAKDVATFLRWASEPEHDHRKRMGLKMLLMMGLLL +PLTYAMKRHKWSVLKSRKLAYRPPK +>sp|Q9K499|CYC1_STRCO Epi-isozizaene synthase OS=Streptomyces coelicolor (strain ATCC BAA-471 / A3(2) / M145) GN=cyc1 PE=1 SV=1 +MHAFPHGTTATPTAIAVPPSLRLPVIEAAFPRQLHPYWPKLQETTRTWLLEKRLMPADKV +EEYADGLCYTDLMAGYYLGAPDEVLQAIADYSAWFFVWDDRHDRDIVHGRAGAWRRLRGL +LHTALDSPGDHLHHEDTLVAGFADSVRRLYAFLPATWNARFARHFHTVIEAYDREFHNRT +RGIVPGVEEYLELRRLTFAHWIWTDLLEPSSGCELPDAVRKHPAYRRAALLSQEFAAWYN +DLCSLPKEIAGDEVHNLGISLITHHSLTLEEAIGEVRRRVEECITEFLAVERDALRFADE +LADGTVRGKELSGAVRANVGNMRNWFSSVYWFHHESGRYMVDSWDDRSTPPYVNNEAAGE +K +>sp|Q753F4|CYC_ASHGO Cytochrome c OS=Ashbya gossypii (strain ATCC 10895 / CBS 109.51 / FGSC 9923 / NRRL Y-1056) GN=CYC1 PE=3 SV=2 +MPAPYKPGSEKKGATLFKTRCLQCHTTEKGGPHKVGPNLHGVFSRQSGQVSGYSYTDAMI +NRNVTWDAQSMSDYLENPKKYIPGTKMAFGGLKKEKDRNDLITYMLKACK +>sp|P25400|CYC_CANGA Cytochrome c OS=Candida glabrata (strain ATCC 2001 / CBS 138 / JCM 3761 / NBRC 0622 / NRRL Y-65) GN=CYC1 PE=3 SV=1 +MSEKKGATLFKTRCLQCHTVEKGGPNKVGPNLHGIFGRKSGQAAGYSYTDANIKKNVTWD +EDNMSDYLTNPKKYIPGTKMAFGGLKKEKDRKDLIAYLKKATSD +>sp|P53698|CYC_CANAL Cytochrome c OS=Candida albicans (strain SC5314 / ATCC MYA-2876) GN=CYC1 PE=3 SV=3 +MPAPFEKGSEKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGVFGRKSGLAEGYSYTDANK +KKGVEWTEQTMSDYLENPKKYIPGTKMAFGGLKKPKDRNDLVTYLKKATS +>sp|P15451|CYC_CHLRE Cytochrome c OS=Chlamydomonas reinhardtii GN=CYC1 PE=3 SV=2 +MSTFAEAPAGDLARGEKIFKTKCAQCHVAEKGGGHKQGPNLGGLFGRVSGTAAGFAYSKA +NKEAAVTWGESTLYEYLLNPKKYMPGNKMVFAGLKKPEERADLIAYLKQATA +>sp|P00043|CYC_DEBHA Cytochrome c OS=Debaryomyces hansenii (strain ATCC 36239 / CBS 767 / JCM 1990 / NBRC 0083 / IGC 2968) GN=CYC1 PE=1 SV=3 +MPAPYEKGSEKKGANLFKTRCLQCHTVEEGGPHKVGPNLHGVVGRTSGQAQGFSYTDANK +KKGVEWSEQNLSDYLENPKKYIPGTKMAFGGLKKAKDRNDLISYLVKATK +>sp|P00041|CYC_ISSOR Cytochrome c OS=Issatchenkia orientalis GN=CYC1 PE=1 SV=1 +PAPFEQGSAKKGATLFKTRCAQCHTIEAGGPHKVGPNLHGIFSRHSGQAEGYSYTDANKR +AGVEWAEPTMSDYLENPKKYIPGTKMAFGGLKKAKDRNDLVTYMLEASK +>sp|O13393|CYC_PICST Cytochrome c OS=Scheffersomyces stipitis (strain ATCC 58785 / CBS 6054 / NBRC 10063 / NRRL Y-11545) GN=CYC1 PE=3 SV=3 +MPAPFEKGSEKKGATLFKTRCLQCHTVEEGGPHKVGPNLHGIMGRKSGQAVGYSYTDANK +KKGVEWSEQTMSDYLENPKKYIPGTKMAFGGLKKPKDRNDLVTYLASATK +>sp|P00046|CYC_SCHPO Cytochrome c OS=Schizosaccharomyces pombe (strain 972 / ATCC 24843) GN=cyc1 PE=1 SV=3 +MPYAPGDEKKGASLFKTRCAQCHTVEKGGANKVGPNLHGVFGRKTGQAEGFSYTEANRDK +GITWDEETLFAYLENPKKYIPGTKMAFAGFKKPADRNNVITYLKKATSE +>sp|Q6E7D7|CPS4_ORYSI Syn-copalyl diphosphate synthase OS=Oryza sativa subsp. indica GN=CPS4 PE=2 SV=1 +MPVFTASFQCVTLFGQPASAADAQPLLQGQRPFLHLHARRRRPCGPMLISKSPPYPASEE +TREWEAEGQHEHTDELRETTTTMIDGIRTALRSIGEGEISISAYDTSLVALLKRLDGGDG +PQFPSTIDWIVQNQLPDGSWGDASFFMMGDRIMSTLACVVALKSWNIHTDKCERGLLFIQ +ENMWRLAHEEEDWMLVGFEIALPSLLDMAKDLDLDIPYDEPALKAIYAERERKLAKIPRD +VLHAMPTTLLHSLEGMVDLDWEKLLKLRCLDGSFHCSPASTATAFQQTGDQKCFEYLDGI +VKKFNGGVPCIYPLDVYERLWAVDRLTRLGISRHFTSEIEDCLDYIFRNWTPDGLAHTKN +CPVKDIDDTAMGFRLLRLYGYQVDPCVLKKFEKDGKFFCLHGESNPSSVTPMYNTYRASQ +LKFPGDDGVLGRAEVFCRSFLQDRRGSNRMKDKWAIAKDIPGEVEYAMDYPWKASLPRIE +TRLYLDQYGGSGDVWIGKVLHRMTLFCNDLYLKAAKADFSNFQKECRVELNGLRRWYLRS +NLERFGGTDPQTTLMTSYFLASANIFEPNRAAERLGWARVALLADAVSSHFRRIGGPKNL +TSNLEELISLVPFDDAYSGSLREAWKQWLMAWTAKESSQESIEGDTAILLVRAIEIFGGR +HVLTGQRPDLWEYSQLEQLTSSICRKLYRRVLAQENGKSTEKVEEIDQQLDLEMQELTRR +VLQGCSAINRLTRETFLHVVKSFCYVAYCSPETIDNHIDKVIFQDVI +>sp|Q54D07|CY1_DICDI Cytochrome c1, heme protein, mitochondrial OS=Dictyostelium discoideum GN=cyc1 PE=3 SV=1 +MFSQKLLANGKLLSKLAIVSGVVGTATAITTSTTLLSDDFVQPTSYPWEHRFPWQSYDHA +AIRRGHQVYTQVCSTCHSLNLISYRHLANVAYTPEELKAMAADTTVMDGPDSEGDMFERK +GQVTDFHPKPYPNSNAARFANNGALPPDLSLVIKARGAHEDYVFSLLTGYCEPPAGVRVV +GGQYFNPYFPGTKIAMAPPLADGMVEYDDGTDNSMSQMAKDVSTFLCWASEPEHDDRKKL +GMKVLLGVSIIALPLFYWKRLKWSVIKTRKISFKD +>sp|P74917|CY552_ACIFR Cytochrome c-552 OS=Acidithiobacillus ferrooxidans GN=cyc1 PE=1 SV=2 +MTTYLSQDRLRNKENDTMTYQHSKMYQSRTFLLFSALLLVAGQASAAVGSADAPAPYRVS +SDCMVCHGMTGRDTLYPIVPRLAGQHKSYMEAQLKAYKDHSRADQNGEIYMWPVAQALDS +AKITALADYFNAQKPPMQSSGIKHAGAKEGKAIFNQGVTNEQIPACMECHGSDGQGAGPF +PRLAGQRYGYIIQQLTYFHNGTRVNTLMNQIAKNITVAQMKDVAAYLSSL +>sp|Q9AJE4|CYC1_KITGR Terpentedienyl-diphosphate synthase OS=Kitasatospora griseola GN=cyc1 PE=1 SV=1 +MKDRAADPVTKFSPSPYETGQFLRISERADVGTPQIDYLLATQRPDGLWGSVGFELVPTL +GAVAGLSSRPEYADRAGVTDAVARACEKLWELALGEGGLPKLPDTVASEIIVPSLIDLLS +EVLQRHRPAVGGKAGQEQEFPSPPGANAELWRQLSDRIARGQAIPKTAWHTLEAFHPLPK +QFAATVTPAADGAVTCSPSSTAAWLSAVGTDAGASTRAYLDEAQSRYGGAIPMGSSMPYF +EVLWVLNLVLKYFPDVPIPREIIEEIAAGFSDSGIGGGPGLPPDGDDTAYANLAGDKLGA +PTHPEILMKFWAEDHFVSYPGEQTPSETVNAHALEYLNHLRMRRGITEFGAVEDACAEWV +ISQQTEDGCWYDKWNVSPYYSTAACVEALLDARKQDEPQLDSLRRAREWLLRHQTDSGGW +GMAEPSPEETAYAVLALDLFASRGGEGAEECAAAISRAKEFFTDESRENPPLWMGKDLYT +PFRIVDVTVMCGRAVVGRY +>sp|Q4HVX7|CYC_GIBZE Cytochrome c OS=Gibberella zeae (strain PH-1 / ATCC MYA-4620 / FGSC 9075 / NRRL 31084) GN=CYC1 PE=3 SV=2 +MAGGDIKKGANLFKTRCAQCHTVEKDGGNKIGPALHGLWGRKTGSVEGYSYTDANKQKGI +EWNDDTLFEYLENPKKYIPGTKMAFGGLKKAKDRNDLIAYLKDSTK +>sp|P19681|CYC_SCHOC Cytochrome c OS=Schwanniomyces occidentalis GN=CYC1 PE=3 SV=2 +MPAPYEKGSEKKDANLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRKSGQAAGYSYTDANK +KKGVEWTEQTMSDYLENPKKYIPGTKMAFGGLKKPKDRNDLITYLANATK +>sp|Q6C9Q0|CYC_YARLI Cytochrome c OS=Yarrowia lipolytica (strain CLIB 122 / E 150) GN=CYC1 PE=3 SV=1 +MGYKEGSAKKGATLFKTRCAQCHTTEAGGPHKVGPNLHGVINRHSGEAEGYSYSDANKRK +GIEWTTEHLFEYLENPKKYIPGTKMAFGGLKKPKDRNDLITWMVENC +>sp|P41179|CCN1_TRYBB G2/mitotic-specific cyclin-1 OS=Trypanosoma brucei brucei GN=CYC1 PE=2 SV=1 +MMTNLNVRTIKGCFPGSLPDALNDMSVINHILSLTGRFDEGQAALASTVNVVYVGTAVYD +IPRYREEYTKNFIARGCGISEVCVAEARSTGATPCAAATMVTPDQLQHLAEAHIILLPEV +TPFSLYDAGRKRASMRVSGQVAARGVVLVGGGCCFRAEHSDSANPKTCAQYMLSRENEVD +SGRPVEMEEGGAKWEYLCVHGLSVLPGIFCPQHSSRDATGLLLNESFSKMLKRHPTERGI +GVDCRAVLLLMGDGRYQVLTIANREGRTASVKDINIQIKDVVEGNVQTTTIQQQGSVEEL +LRKPCGPVVRDPFEAYYAMANPTALTEKLLCAPR +>sp|Q06374|CG1_COLGL Putative G1/S-specific cyclin OS=Colletotrichum gloeosporioides GN=CYC1 PE=2 SV=1 +MLKRATVDVSSPADVPQPKRRKAYLTGADQARDEYAEDAIKHMEYIESETTPDKRCIPAS +VLEMRPEVIRSLIKIHKELNLLPQTLLLSVNLLDRFCCKKDFLGQVYYRVFSCAALWIAS +KFVEDKDRVPSVCRVLDYVPSGMYCNSALFFRLEICMLNVLAWRVNHPTPWCFIQLFCPG +EGNKTKVRELALDICTARLPENRWMAIRPSILARHCLLHACNNLFSDTSLVEMYGSMPRD +KPSTH +>sp|O93863|CYC_PACTA Cytochrome c OS=Pachysolen tannophilus GN=CYC1 PE=3 SV=3 +MPAPYEKGSAKKGATLFKTRCLQCHTTEAGGAHKVGPNLNGVFGRHSGQAEGYSYTDANK +QKGALWEAQTMSDYLENPKKYIPGTKMAFGGLKKAKDRNDLVTYLLSATK +>sp|Q6Q4H8|CYC_PICPA Cytochrome c OS=Komagataella pastoris GN=CYC1 PE=3 SV=3 +MPAPFEKGSEKKGATLFKTRCLQCHTVEAGGPHKVGPNLHGVFGRKSGLAEGYSYTDANK +RKGVEWSEQTMSDYLENPKKYIPGTKMAFGGLKKAKDRNDLITYLASATK