diff --git a/.github/workflows/github_build.yml b/.github/workflows/github_build.yml index 856dbfb..215b7f4 100644 --- a/.github/workflows/github_build.yml +++ b/.github/workflows/github_build.yml @@ -41,6 +41,11 @@ jobs: pwd pip install . path=$(fdog.setup -d ./ --getSourcepath); for i in $(less $path/data/dependencies.txt); do sudo apt-get install -y -qq $i; done + path=$(fdog.setup -d ./ --getSourcepath); ls $path/data/ + echo "#########################################" + pwd + ls + echo "#########################################" echo "TEST fdog.setup" fdog.setup -d /home/runner/work/fDOG/fDOG/dt --woFAS echo "TEST fdog.checkData" @@ -49,6 +54,8 @@ jobs: fdog.showTaxa echo "TEST fdog.run" fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@qfo24_02 --fasOff --group mammalia + echo "TEST fdog.assembly" + fdog.assembly --gene test --refSpec HUMAN@9606@qfo24_02 --augustus --augustusRefSpec human --coregroupPath core_orthologs/ --out test_assembly --fasoff mkdir seeds path=$(fdog.setup -d ./ --getSourcepath); a="1 2 3"; for i in ${a[@]}; do cp $path/data/infile.fa seeds/$i.fa; done echo "TEST fdogs.run" diff --git a/.gitignore b/.gitignore index 38cf321..d1b7979 100644 --- a/.gitignore +++ b/.gitignore @@ -128,8 +128,16 @@ dmypy.json # Pyre type checker .pyre/ +# DS_store +**/.DS_Store +/fdog/.DS_Store +/fdog/data/.DS_Store +/fdog/bin/.DS_Store +/fdog/setup/.DS_Store + #Hannah /fdog/data/core_orthologs/ /fdog/data/assembly_dir/ /fdog/fdog_goes_assembly/tmp/ taxdump* +.DS_Store diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/fDOG-Assembly.iml b/.idea/fDOG-Assembly.iml new file mode 100644 index 0000000..8e5446a --- /dev/null +++ b/.idea/fDOG-Assembly.iml @@ 
-0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..b44cad8 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..6773d30 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index a85e62a..54177f6 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -recursive-include fDOG * +recursive-include fdog/data * diff --git a/README.md b/README.md index d51cd4d..654f792 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) ![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg) +# Poster fDOG - Assembly +https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf # Table of Contents * [How to install](#how-to-install) * [Install the fDOG package](#install-the-fdog-package) diff --git a/fdog/.gitignore b/fdog/.gitignore new file mode 100644 index 0000000..1912743 --- /dev/null +++ b/fdog/.gitignore @@ -0,0 +1,143 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ 
+*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# DS_store +**/.DS_Store +/fdog/.DS_Store +/fdog/data/.DS_Store +/fdog/bin/.DS_Store +/fdog/setup/.DS_Store + +#Hannah +/fdog/data/core_orthologs/ +/fdog/data/assembly_dir/ +/fdog/fdog_goes_assembly/tmp/ +taxdump* +/fdog/fDOGassembly.py diff --git a/fdog/addAssembly.py b/fdog/addAssembly.py new file mode 100644 index 0000000..171adb6 --- /dev/null +++ b/fdog/addAssembly.py @@ -0,0 +1,124 @@ +import fdog.libs.addtaxon as addTaxon_fn +import fdog.libs.tree as tree_fn +import fdog.libs.zzz as general_fn +import sys +import os +import argparse + +def check_fasta(file): + nHeader = general_fn.count_line(file, '>', True) + nSeq = general_fn.count_line(file, '>', False) + if not nHeader == nSeq: + return(1) + nPipe = general_fn.count_line(file, '|', True) + if nPipe > 0: + return(1) + return(0) + +def check_path(path): + if not os.path.exists(path): + return False + else: + if os.path.isfile(path): + return "File" + else: + return "Path" + +def parse_file(path): + file = open(path, "r") + lines = file.readlines() + id_dict = {} + for line in lines: + line = line.rstrip() + ncbi, name = line.split("\t") + id_dict[ncbi] = name + file.close() + return id_dict + + +def main(): + print("#################################") + #################### handle user input ##################################### + version = '0.0.3' + ################### initialize parser ###################################### + parser = argparse.ArgumentParser(description='You are running fdog.addAssembly version ' + str(version) + '.') + ################## required arguments 
###################################### + required = parser.add_argument_group('Required arguments') + required.add_argument('--fasta', help='Path to fasta file or folder', action='store', default='', required=True) + required.add_argument('--out', help='Path to output folder.', action='store', default='', required=True) + required.add_argument('--ncbi', help='NCBI ID of species or a mapping file (tab separated) containing the NCBI ID and the corresponding file name placed in the folder given by --fasta. ', action='store', default='', required=True) + required.add_argument('--ver', help='Version', action='store', default='', required=True) + optional = parser.add_argument_group('Optional arguments') + optional.add_argument('--link', help='links fasta files instead of copying them', action='store_true', default = False) + + args = parser.parse_args() + fasta = args.fasta + if check_path(fasta) == False: + print("%s does not exists. Exiting ..."%(fasta)) + sys.exit() + else: + format = check_path(fasta) + out_folder = args.out + out_folder = os.path.abspath(out_folder) + '/' + os.system('mkdir %s >/dev/null 2>&1' % (out_folder)) + ncbi = args.ncbi + ver = args.ver + ln = args.link + id_dict = {} + + if check_path(ncbi) == False: + if ncbi.isdigit() and format == "File": + id_dict[ncbi] = fasta + else: + print("%s is no file or digit. Exiting ..."%(ncbi)) + sys.exit() + elif check_path(ncbi) == "File": + print("Parsing mapping file ...") + id_dict = parse_file(ncbi) + print("... done") + else: + print("%s is no file or digit. 
Exiting ..."%(ncbi)) + sys.exit() + #print(format) + #print(fasta) + if format == "File": + fa = id_dict[ncbi] + if check_fasta(fa): + name = addTaxon_fn.generate_spec_name(ncbi, "", ver) + if ln == False: + assembly_folder = out_folder + name + os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) + os.system("cp %s %s/%s.fa" %(fa, assembly_folder, name)) + else: + assembly_folder = out_folder + name + os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) + os.system("ln %s %s/%s.fa" %(fa, assembly_folder, name)) + else: + print("%s Fasta format not valid or header includes |"%(fa)) + + else: + for sp in id_dict: + print("Adding species %s"%(sp)) + #print(id_dict) + fa = id_dict[sp] + fasta = os.path.abspath(fasta) + '/' + #print(fa) + #print(fasta) + fasta_path = fasta + fa + if check_fasta(fasta_path): + name = addTaxon_fn.generate_spec_name(sp, "", ver) + if ln == False: + assembly_folder = out_folder + name + os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) + os.system("cp %s %s/%s.fa" %(fasta_path, assembly_folder, name)) + else: + assembly_folder = out_folder + name + os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) + os.system("ln -s %s %s/%s.fa" %(fasta_path, assembly_folder, name)) + else: + print("%s Fasta format not valid or header includes |"%(fasta_path)) + + print("DONE, files can be found: %s"%(out_folder)) + +if __name__ == '__main__': + main() diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/HOMSA@9606@v1.fa b/fdog/data/assembly_dir/HOMSA@9606@v1/HOMSA@9606@v1.fa new file mode 100644 index 0000000..41a868a --- /dev/null +++ b/fdog/data/assembly_dir/HOMSA@9606@v1/HOMSA@9606@v1.fa @@ -0,0 +1,702 @@ +>CM000680.2 79953000-80009000 +CTCCTGCCTTGCGCCCCCAGGCTGGCGTTGGAACTTCTTTAGAGCCAGCGCTGCGGAGCCCCAGCACCAGGAACGTAGAG +GTGAGTAAAAGGCACAGCAGGAAGCCTGGGAGAGAAAAAGCAGCTGCCCCTGCTCAGCCGCGCCCTAGCCTGAGCCCAGC +AGCCTTTGGGATCATCTGTCCTCTCTGCCCTGGAGGTTGGAGTAAGCTCCGGACCTCCTTGACCCTCACGGGAGCACTGG 
+GGTCCCAGACGCTATCCTCGGTTATCTTAAATCCAGAGGCTGTGGAGGCATGGGGCAAATTGTGCCCGCAGCCTGGGGCC +TAGAGCTGCATCTGTGCACGGGGCCTCCAGGAGCCCGGCTCAGTGGACGAAATCTGGCATGCTCGGACGTCACCCCACAG +GCCTCACGCCTCAGAGGCACCCCGAAAGGTGAGTTGATGACTGCAGAGTGAGTTTGTACCTTTTCGAACACCACCTGCCG +GGGGTCcgagcccccacccccagtctACATCAgccaatcacagctcattgcaggtGCTGCGCTTGTCACATAGCTTCCTG +CAAACAAGAGAAGCCGACCAAGCCATGCACAGCTGGAAAATggcccttataaagccatcaagTCCTAATCCAGGCAACCT +GTAAATATTACCTCGTAtgggaaaacaatttttttttctaatgtcattaaggatcttgagatagagagattgtcctggat +tatccaggtgggccctaaacactgtggctcacgcctataatcccagcactttgggagcctgaggcaggtaaatctcttga +gcccaggagttcgagaccagcctgagcaacatggtgagacctcgtctctacaaaagatagaaaaattatctgggcatggt +agtgtgcacctgtagtcccagctactcctgaggtggaaggattgcttgaggccaggaggttgaggcagcagtgagccagg +atcgcaccactgcactctagcctcggtgacaaagtgagaccctatctcaaaaataaatacatacaacagagacagggtct +ctgtgttggccaggctggtcttgaactcctggcctcaagcgatcctcctgccttggcctctcaaagtgctgggatgacag +gcatgagccagcgtgcccggccACATGCATCTTtgtaagagggaggcagaaggaaatTTGGTGCCACACAGAAGAGAGGG +CCATGGGAAGATGGAGCAGAGATGAAGATGGGGCCTTGAAAATTGACGTGATGCAGTCCCAAGCCAAGTAACGCCGGAGC +CACCAGAGGCCCCAGAAACAAGGAGCAGATTCTCCCCAGGAGCCATGCGAGGGAGCCTGgccctgccaatgccttgattt +cAGCACATTAAAAGtgattttggacttttggcctccagagctgtgaagGAATAAACTTCTGCTGTTGTAAAAGCCACTTA +GCTGTGGTCATTTGTTGCAGGAGCCAATGAGTGACTGACAAGATCAGAGCTGTGACACAGAGGAAGCAGAGCTGTGCCTC +ATGCTTTCCAGGCTCTGCTCATGAGAAACAGGTCACTGGGTTGGCCCGGTGCAGATTAAGGTGTCACGTAGGAGCATGAA +TACCTCAGGGAACCACTTAAGAATTTGCCTAACACATGGTGCttataaaaggaaaagcaaacaaagcAACCTTGTTCTGA +GAGCTGGGACATCAGAGGAAGATTGGAGCCACTGTTTCTCACTCagcctatttctttttattttatttatttattttata +gagacggggagctcactatgttggccagtttggtctttgaactcctggcttcaaaggatcctctcacctaggcctcccaa +agtgctgggattacaggagtgagccaccttgctGGTCGCGCACCCCATTTTTTAAGGAAACCTGTACGGAAGGTGGATTG +GAAGTCTCACTTTAGGAGCCTGGCTCAGAAGTTCCGCAACTTCAACTGGTGTCATGTGACGTGCTTGGCAGTCACCACCC +CCATCCTTACAGCAAAGAATAGCAGGGCAGCCTTTCATGGGCCCATCAGAGAACTAAGGCTGCCAGACAAACTGGCACCC +TGCTCTGAAGAGACAGGCACATCCAGGGAGAGACAGCACCTGAGAGCTGCTCACCCAGAGCAGAAACTCCTGGATAAACT 
+GGCTTTAGCAGAAAAATTAGACAACATGCAAGACCAGATAGGTGATATCAGcagaaagatggaaacaataataaaaaaaa +aatgctagaagtcaaaaacacaatagaaataaagaatgctCATTAGTTGACTAgacagaatgaaggaaagaatcagtgaa +tatGAAGATAAGTCAAcagaaacttcccaaactgaaattcaaGGAGAAAaggtgatgaaaaaaaaaaaaatcaactgatc +TAGTTTGAgtgtgtgtccccaccaaatctcatgttgagttatAATCACCagttttggaggtggggcttggcagggggtgt +ttggatcatgggggcgggttcctcatgaatggcttgcaCCATCGCTTCATCCCCTTGTGATGAGTAagctctcactctga +gttcatgtgagatctgtttttgtttttgtttttgttttgagacggagtctcgttctgttgcccagactggagtgcagtgg +cacagcacagtctcagcttactgcaacctcctgggtttaagtgattcttgtgcctcagtctcccgagtagctgggattac +aggcttgtgccactatgcctggctaatttattgtatttttagtagagatggggtttcaccatgttgaccaggctggtctc +aaactcctgacctcttgatccacccacctcggcctcccaaagtgctgcgattataggcatgagccaccgcgcccgcctga +gatctgtttttctttaaagtgtCTGGCACCTACCCAcaccctcactctctctctctcttgctcctgcttttaccGTGTga +aatgcctgctcccactttgccttccgccatgagtaaaagttccctgaggcccccaagaagctgagcagatgctggtgcca +tgctggTATGGCCTgcaagaactgtgagccaattaaatctcttctctttataaattactctgtctcaggtatttctttat +agcaatgcaagaacagcctaatacaccaACAGAAAACACCATCCAAGAACAACATCAAAGGATATGATTGGTACAATGGG +AGAATcggaaggaggaaagaggagaaggggCAGGAGAAATGGTGGAATGGCAAGAATGTCCCCaaattaatgacagacac +caAACCATAGATCCAGGAAACATACAGAACATCCATCAggacaaataacaaaaaacataaagcaaaacaaaacaaaaaac +cacacctAGGCTTGTCACATTCAAACTGAATAAACCTGCCTGACTGAACCCTCGTCCACCAGCCACAGCTACACCTCTGA +CAAGACAAGAGACAGATTTCAGTAACTCTCCTGGTAAGAGACCACTGACCATGGCTGGTCCCGGCTGGTTTACAAACTCT +GTGCACCAAGTGCCTTTGAGTCTTAGAAAGACCTTTTGATGTATAGGGCCTAACTGTAATACATGTAAATGTTACATCTc +taccccaaagtgaacatgggtcaTGTGATACATACAGCAAGACccccttcatgaatattcatagctcctcatGTAACCTA +TTAAGTATATTTAGCCAACCAAGTCAGCATAAAGCTCGTGCCCCATCCTCTCCTCCTCTGAAGGGCCTGTGTCTGGTCTT +GTGGGATGTCTATCTTGCaagttatttaagaaatagtCTCCTTTTCTAAATGCAGTGTTGTGTGATTTTTTAAGTTAACA +AGAggaatatgaataaaaatgaccGTGGACTTCTTGTTAGAAATCATGCGGATAACAAAAGGGTAgagtgaaatcttttt +ttttttttttttttgagacagagtctcgctctgttgtccaggctggagtgcagtggcgcaatctcggctcactacaagct +ccgcctcccgggttcacgccattctcctgtctcagcctcccgagtagctgggactacaggcacccgccaccacatccggc 
+taatttttttgtatttttagtagagacggggtttcacagtgttagccaggatggtctcgatctcctgacctcgtgatctg +cctgcctcggcctcccaaagtgttgggattacaggcgtgagccactgcgcctggccgtgaAATCCTTAAAGTGttgcagt +tgaaaaaaaaaaaatctgcccacCTAGAATTCCATATAGCAAAAGTATCCTTAAGGtgcaggaaaaataaagactttttc +agaaaaacaaaaactgagggaatgtATTGCCGGTAGACCTGCCCTGTGAGAAATGTTAGAGAAGAGACAAGGAAGATGAT +ACAGGTCAGAGGCTTAGATCTACATCAGGAAAGGAAAAGCATCAGAGGCAGAATAAATGAAGAGAATTAAtgtctcttgt +ttttcttttttttctttcttttttttttttttaagacagagttttgctcttgttgcccaggatggagtgcaatggcgcaa +tctcagctcactgcaacctccacctcccaggttcaagcaattctgcctcagcttcctgagtagcttggctTACAaggaca +tgccaccatgcccagctaattttttttttttttttttttttttgagacagagtctcgctctgtcacccaggctggagtgc +agtggtgcaatatcagctaactgcaagctccgcctcccgggttcacgccattctcctgcctcagcctctcaggtgctgga +actacaggcgcccgccaccgcgcccggctaattttttgtatttttagtagagatggggtttcactgtgttagccaggatg +gtctcgatctcctgacctcgtgacctgcccaccttggcctcccaaagtgctgggattacaggcgtgagccaccgtgcccg +gcctataattttttgtatttttagtagagaccgggtttcgccatgttcgccaggctggtctcgaactcctgtcctcaggt +aaTCAACCCATCTCAGTCtcgcagagtgctgggattatacgcgtgAACCACCTcgttattcttttttgtttttgaggcgg +agtcttgctctgttgcccaggctggagtgcagtggcgcgatctcagctcactgcaagctccgcctcccgggttcaggcca +ttcttctgcctcagccttccgagtagctgggactacaggcgcccaccaccatgcccagctaatcttttgtatttttatag +agacggggtttcactgtgttagccaggatggtcttgatctcctgacctcatgatccgcccgcctcggcctaccaaagtgc +tgggattacaggtgtgagccaccgcgcccggccaacctcgttattcttaattgatctaaagACAACTGTTAAAAGCCATA +AAGTAACAATGTACTGGGTGATTACAGCTTGTGGACAAAATGTCACAAGGGACAGTGGGAGGAGCTGGGACTTTTCTGAT +GTAAAATACTTCCATTGTGCTGCCCTCCCTACCACGGGTCTAAAGCTATATTCCACAGCTTAAGCCACTTGGTCACACAC +TGAAACCCCTGGGCAGCTTTAAAACCACAAATGCTCAAGTGCCCGGGCCCACCCGCAGAGCACCGATAAAGGGGACTGTG +CCAGTCAGGGTAGCCTTCTAAGCCTTCCCCAGAAGTTCTGATGTGCAGCCATATTTGAGAgccactgttttaaaattcac +acCTACAAAAGCTGCATGGTACAGGAGGTCCTCATTACCCACAATCATGACTTCTTGGAAAGACCACTTACTCAAACAGG +ACCAGAATTGCATAGTAAATAATTCATGGGATTAGAAAATAGTTGGTTGATATTGCATTAACAATGAAAATTAAACTTAT +GAAAATATTGcaattattttagaataagtaaAACAGagcgggccaggtgcggtggctcactcctataatcccaagacttt 
+gggaggctcaggtgggtggatcccttgagctcaggagttcaagaccagcctggtcaacatagtgagaccccccatctcta +ctaaaaatacaaaaattagtggggtacattggtgtgcatctgtagtcccagctactcaggggtctgaggtgggaggatca +cctgagcccaggaggtcaaggctgcagtgagtcgtaattctgccattgcactccagcctgggcaacagagcaagaccctg +tctcaaaaaaagaaaggagcaacAGTCTCTACTCAGATCTGTATACCCAAAtctatacccaaaagaactgcttttatata +atttttaaataaaacttttcaaacACAAAAGAGCGTAACATAATGAAAGGCATGTATCCATCAACCAGCATCAATAATTA +TAAAATCGCTCTTTTAATGAGTATAAAATAAGAACTGAACTTAACGTTTAATTGGCAGCGTTTCTCCATTTCTGATAATA +ACAGTGCACCCTAAAATCAGCAGCGTCTTAGATTTAGGGCCTCAGAAACAAGGCCACCCACACTGGATCCGAGGGGCAAC +TGGGCACCTCCATGGACACCCATGCTTCATAAAGACAGCTGCACCCTACACTGGGTTGGACTGGGACGTTTTCAGCTATC +CAAAATAAAACGTTGGTTTTCTAATTTAAAGCTTTGTTTCATGTCTACATTCTGCTCTTGTAGTCTGCATCTTCGCTCAC +GTAATTAGCTATTTCAGTCTTACCCAGACTGTTTTCTTCCCAATTGTCTGTCCTCACACCAGGGAAAgactggctaattt +ctgtaaCTTCAGAATACAcattaatattctttaatatttagaTTCCCATGGTCAATTATTCATCCTGGGCTAGTACTCGG +CAGTTTCTCTTTTGTCAGTTCATGCGCCACCCAGGCAGCATATGCTGAGGAAGGCACAATTTCTACATTTATCCTAAGCC +AAACATGAGCAGGAAAACTGAGAGCTATTCTAAACATAATTTTATCGGGTCAAAAAGAAACAACTGCTAGGCCGATCTGG +CCGATGACTCCTCCAGTAGCTGTGAAATCGAGAATGCACATCATTTTCTTGCTGTGTTCCTAAGGTAGAGGAGACCACTT +TGAAGACCCGCAGTCCATATTGGCATTGTAGAATTTGAGCAGAGggccattttgttctgtactaagaaaaattcttctgc +cttgagatgctgctaatctgtaaccctacccccaaccctgtgctccctgaaacacgtgctgtgtcaactcagggttaaat +ggattaagggctgtgcaggatgtgctttgttaaacaaatgcttgaaggcagcttgctcgttaagagtcatcaccactccc +taacctcaaaccactccctaatctcaagtactcagagacacaaaacactgcggaaggctgcagggacctctgcctgggaa +agccaggtattgtccaaggtttctccccatgtgatagtctgaaatatggcctcatgggaagggaaagacctgaccgtccc +ccagcccgacacccgtaaagggtctttGCTgagaggattagtaaaagaggaaagaacgCCTCTTTGaggttgagataaga +ggaaggcttctgtctcctgctcgtccctgggcaatgaaatgtctcggtgtaaagccaattgtatattccatctactgaga +taggggaaaaccgccttagggatggaggtgggacatgctggcagcaatactgctcctTAAGGCATTGAGATATTCGtgta +tatgcacatcaaaagcacagcacttttttctttacgttgtttatgatgcagagacatttgttcacgtTTTTACcttctga +ccttctctccactattatcctattaTCCTGCTGCCACGcccgataatgatcaataaatactaagggaactcagaggccgg 
+tgccaGTGTGGATCCGTATGCTGaacgccggtcccctgggccccctttttgtttctttatactttgtgtctctttctttt +ccaagtctctcgttccacctaacAAGAAACatccacaggtgtggaggggcaacccaccccttcactgTGTTGTCTCCATC +AGATCCAGCTGAAGAGCTTTGCTCGAAAGCCAGTCCTTGGCCAGCATGGCCTCGTGCCTGTAAGTCCAGcgctttggggg +gccgaggtgggaggatcacttgagcccaggagtttgagaccagccctggcaacatatggagatccctgtctctagaaaaa +taataaaaaatagtagccgggcatggtggcgtgcctctgtagtcccacctactcgggaggctgaggcaggaggatccctt +gagcccagttcaaggctacagtgagctatgattgcaccactgcactccagcctgggtgacacagcaaaacccaatcccag +aaaaaagaaaggcagtcCTGTCATCTCCACAGACCACCCGGCCTATGTCGGTGAGGCCTGAGGCTGCTGTGTGCGGCGAT +GCTGGGTGGTCCTACTCGGGCTGCCCAAGAAGCCTGACCTGAAAAAGCAGCCAGGGCTGGAGGGACGGCCCAATATGAGC +CTGGCCGATGCCggctggagaggagagaggcGTTCCCAGGTCAGAGGCAGGCTCGTCTCGTCTTCTGCAGGGCCAGTTCC +AGGCTCGTGTGGGCCAGTGGGGCTCCCTGCTCGTTTAGCTGTCTGTGGTTAGGAACGCATAGCCCATGAGGACAGGGTGC +CCAGCTTGAACCTGCACCCAGAGTCAGCCAGTCAACATCTGAAGATAACGGGTTCCACAGGGTGCCCACAAGCTCTCCAA +AGTCCCAACGCCAGTCTGCCTTCACGGCTGTCCGAAGTGCTTTATCACTGGTGAGCTCAGTGCTGTCAAGAGCTCTGTGG +CCCCTGCTCCGCCTGAGTAAGGGAGGAGCGGCAGCCAGGGAGCAGCCCCGCCCTGGAGACACCGAAAGCCCTTGGGTCTG +CGCCTGCCCCCACCACGAGGGAGCCGTCCCCTGAAGCCATGAGCTTGATGCTGGGCTGTCCCTGGAGTAAAGCTTCCGGG +AACTCCGTGCCTGCAGAGTAAGGGAAGAGGGGACGTTCTAGAACTTtctggaagggaggaaggagggcagTCTGACTGGC +TCCCTGCTCATTTAGCTGTCTGTGGTTAGGAAAGCTGCCTGTGGGCTGATGAGGAACCCTGTCGTGGCCCTGCCGTGGGG +CACCACTGCCCATCTCCTCCCGCCTCCAGAGATGAGCGCCCTCTGAAGAGCTTGGAGGGCTGACGCCAGCCCGCTGTGTC +TGCCTCCTCTCTACAGGCGTAATTTCCTTTTCCCCTAAGTTAGCTTGAGTGACTTCTGGTTCTTGACACTGAGGTGTGTT +TCTGATGGAAACACCACCGGCTTGTACTCCTTGCCCCCCACGCCAGGCAGGCTTTCCTGGGTGCAGGCCGTCCACCCTGT +ACTGTGGCATCTTctggccctgcctcctcctcctcagcagcCACCCACACTGGCTTTCCCGAGAAGCCAGAAGCCGCCCG +GCAGGATTCCTTAGGCCTGAATGGTCCTGGGAGTGGGCCGGGAGCCACAGCACAGCAGCCACTTTCCACCCACACCGCGT +CTTTCTCAGCCGTCCACACTGGCCCCTACAACTAAGTTTGTATCTCAGCCTCtctcagagacagaaaaaaagaggaggat +gCTAAAAGCACACAAAAAGTGGCAAAATggccatgcttttaaaaaataattttaacagcttcattgaaatgtaattcaca +taccacaaaACCCACCCATTTAAAGGGCATAATTCAACAGTTTTGAGTATATTTACAGGGTTGTGAAACCTTCTTAcgat 
+ctaattttagaacatttttgtccCCCCACAAAGAAACCCCAGACCTCCTAAATCTCTCCGCACTCTCCCTCGTCCCTGCA +GCCCCCAGCAACAGAAGTCAACTTTCTGTCTCCGTGGCCTTGCCTGTTCCAACTCTCCATGGACTTACCACTCAGGGTCT +TCACGCCTGGCCTCTGCCACCCAGCGTTGTTCCGGAGGCCCGCCTGCTTCGTAGCGTGCGTCAGCGCCTCACTCCTTTGT +GTGGCTGAGGAATATTCTATCATCTGCATTTTGTCACCTACTTATTGGgggtggacatttgggtggtttccaccTTTTGA +GCATTACTGATGCTGCAGTAGACTTGGTGCACTCGCGCTGTCGTTTCGGTATATGCCTAGGGGTGGCGCGGCTGGGCCCT +GTGGAAACTGCTTCACTTCTGAGGACCCACCCGACTTTTTCACAGCACTGCAGCCTTTCACAGCCCACCGGCTGCGTGGG +GGGGGCCACAATCCCCACACGCCTCTCGGACTTACGTTTGTCTCTCTGATTCTAGCCATGCTGGTGGGTTGCAGTGggag +ctcattgtggttttggtttgtgttttcCTGATGACTGGTTGTAATGAGAAAAACCGAGAATCACATTTAAAACCTTACCC +TAAAGTGGGAGTTGCTGAGAGACCAAAGAATGACTGGGCGGAGTCCAGCTTGGTGAGTAGGTGAGTTTATTAGGACTCAC +ATGTGGGGCACTCGTGGGCAGCAGGACAACTCTAGAGATGAGCGTGCTTCTGTCCCTAAGCTGCTTTTGAGCTAATTTTC +TGCCTCTTTGCGCCGTCTGTGTGATGGGCCTGCTCCCTGTGGTGGTTCTCACATCCCCTCCGGGCTGTTTGGGTTCTCAG +GGACACCTGCTCCTCGCTGGGCACCGTGGCCTGGACTCACCGCCTGGCCTTTAGGGTTCAGGCCACAGACATCCGCCCTG +AAGTAACCTGGCGGGAACCGCCACGCCACAGCGGTGGTGCTGGGTGGCTGTGTTTTATTTATCCGCACATGGAGACACCA +GAGCTTCCAGGTGTGGTCACTGGGCTGTGCCCTAGGGATGCTGGGGTGACCAGACATAGCCCTGTTCCCACAGCGCTGCA +CAGGGGCACAGACATGGCAACAAGACTGGTAACCCTAAAACACCGTCCTGGGAGGTCACCCACAAGGTGACGAGCCACAA +CCCAGGTGTGAAAGAGGCCAGGgaaggcccggcgtggtggcccAGGCCTgcggtcccagcactttgggaggccgaggcag +gaggatcacttgtgcccaggaattcgagaccagcctgggcaacatagtaagatcccgtctctacaaaaaataaaaaagat +cagccGGACCTGGTGGCGCgcgcccgtggtcccagctactggccaggccgaggcaggaggatcgcttgggcccgggaagt +cgaggctgcggGGAGCCATAgtcgccactgcaccccagcctgggccacagagcgagccCCCGTCTCTGAGAAACGCAGCA +GGGGTGACCGAGGCCCATGCACTGCCCGTGCGGAGCGTTTTGGGGGTGTAGGTGGCCGGTGGCGCCCACGGGCCCTCTTG +AGTAAGGCGGCTCCGCCCAGGCCGTCCCGGGGCCTCGGCTCGGCTCGGCTCGGGAAGCCGCAGAGCCTGGGGGCGCGGAC +CAGTCCTCCGAGGCGGCCGCTCGGTGACATTGCGTCCCTGCAGGTGCAGCGCCCGCCTCTCCGCTCCGGCCCCGCCTCCG +CCCTGGAACGCAGCGCGCTCCGCCCGAGGCCTCCCGGCGGCCCATACGGGAATCGCGGAGCTTAGCTGTCGCCACCTCGC +GCCGGGTCCGCGCGGCCCACGGGACCCCCCACTGACGCCCCCGGCCAGCGGTCCACATGGACGTGCGGGGCCCTGAAGCC 
+CCCGGCGGGCGCGCGCTGCGGGACGCGGTGAGCCCCTCCCCGACTCCTGCTTCTCTCTGGATGGGGGCGCCCCTGCGGTT +CTGGGGGGTTTTGAGGTCCTGGGGGGGGAGCCTGCGGTGCTGGGGGTGCCCTGCGGTCTTCGTGGGGCCCTGAGGTCCTG +GGGGGCCTGCGATCCTGAGGACTCCTGTGGTCCTGAGGAGTCCTGAGTGCCTGGGAGGCCTGTGGTCCTGGGGGGGCCCC +TGAAGACCTGGGGGACCCTGCGGTCCTGGGGGGGCCTGAGGTGCAGGGGGGAGCCCTGCTGTCCTGGGGGACCCTGCGAT +CCTCGGGGGGCCTGAGGTCCTGGGGGGAGCCCTGCAGTACTGGGGGGCATTGAGGTCGGGGTCAGTGGGGAAACAACTGG +GCCAAAGGGTGCCCGAAAGGGGACCCACGAGTGTCGCAGCGCCCAGGGCTCCGGCCTCTTCCGGGAAATCCTCCCGCTCC +CCAGGCCTTCCTACTTGCCAAAGAGTTCCAGGCCCACAAGAGGACTGGCTATGAGGAAGAGACCTGGAATCTGAAGGAAT +GTGTTGGGCGTTGTGCAAACCCTAACGTAAATTTCCTGACAAAGGTAGAAAGCCCTGGCATGGTTCAGAGGTGGGGCCTC +CTCCTATGTCGACGGGATTCTAGATTCACACCATGGTACGTGGGGCTCCCTGGGATCTTGAACCCCCAGCTGGGAGAATT +TTGTGCTTCTCAGCCTCACTGTTCTCATCTCACCCTGGACACGGTCGCAGGGTGAGGAGTAGATTACACCGTGAATGGTG +TGGGACGTCAGTCTGTAAACTACAAAGCAGTATATAAACAGGAAACGCTTTCAGTAGCAATGGTGGTATTTCTGTGTGAC +CTGGTAAGTAATTTCATAAGAATTCTTGAGGAGCGCAGGGTCACGAGATTTTCTCCTGCATCCTCCTCTGCATGCTTTAT +GGCCTTGGCTCTGGGCCAAGGTGTGTGATCTCCTGCATTCATGGTGTGTGGTGTGAGGAGGGGGATGGGTCCCTGCCCCG +TCTGCACGTGGCCCTCATGATTCCCGCGCtgcttgttgaaaagactttccttTCCCTCTGAATCCAAAGGCCTTAGCACC +TTTGTCAGAAGTCAACAGATTTATGGATGGGTTTATCccatgatatttattttttttatttttattttttgagacggagt +gtcactctgtcgcccaggctggagtgcagtggcgcgatctcggctcactgcaagctccgcctcccgggttcacgccattc +tcctgcctcagcctcccgagtagctgggactacaggcgcccgccactgcacccagctaattttttgtatttttagtagag +atggggtttcacctagtttgccaggatggtctcgatctcctgacctcctgatccacccgcctcggcctcccaaagtgctg +ggattacaggcgtgagccaccgcgcccggccttatcccatgatatttaaaatgtcatacgtgggccaggtgtggtggctc +atgcctgtgatcccagcactttgggaggtcaaggtgggcagatcacctgaggccatgagttcaagaccagcctggccaac +atggtgaaactccgtctctactgaaagtataaacattagctgggcctggtggcacatgcttgtaatcccagctacttgga +ggctgaggtgggagaatcacttgaatccaggaggcggaggttgtagtgagctgagatcgcgccattgcactccagcctgg +gcaacaaaacaagacttcatctcagaaaaaaaaaaaaactcatatgccaacagtaaatatttattcaattgtCTTACGGT +TTATTTTTTCAGGCAGAAAATCTATTTCAGGAACTTCAGGAACATTTTCAAGCTCTGACGGCAACATTAAACCTCAGAAA 
+TATCCTTTTCTACCTTTAACAAATGCTGTGATTCTTTCGGACTGGTAGATTATCATGGAGTATCTTTTTGTTGTCTGGTA +GTAGTAGGTAATAGTTTACTTAGGATTTCCCAGTATTTACTTCTGTGCTTTTATGTGGCTTCCTGATGTGTTAATTACCC +CTCACCTATAGCAAAAGCTGTACctccggccgggtgcagtggctcacgcctgtaatcccagcactttgggaggccgaggt +gggcagatcacgaggtcaggagatggagaccatcctggctaacacagtgaaaccccgtctctactaaaaatacaaaaaat +tagcccggcatggtggcgggcgcctgtagtcccaactactcaggaggctgaggcaggagaatggtgtgaacccgggaggt +ggagcttgcagtgagccgagatcgcgccactgcactccagcctgggtgacagagcgagactccgtctcaaaaaaaaaaaa +aaaaaagctctaccTCCTGCTTGACTGGCAACACAGGATGTCTTGTGCAGCAAGTATAAAGgacaaaaacacaaacatat +aaCACTTCCCAGTGGGGCTTCCATCAGTCCCCAACCGGACAGTGCCCCCTAGACCATAGACCCCACAGAGCATAACACTG +AAGCACAGGATCCCAGATACGGCTCTCccaaactttgttttgtttagtttaacATCGCCAACTCACTGTTTTGTCTTAGA +AACATGAAATCAAACAGAatgagaatataattttttttggagttggagcAAATCTAGATAAGGGTCACAATTGCCCCAAG +ACTTGCCCAATTTGAGCTCATGCTACACAATTAAATATATTCCTTAAATATTGTGTGCTTGCAACTTTTGATTTTGTAAA +TGGGTTCCCACAGGACAAAAATGAgtcttaaacataaaattaaaccaTATGAATGTTAGTTTTTTAGTTAATTTCAGCTG +GGTGGtagtctatagtcccagctactcgagaggctgaggcagaaggatcgcttgagcccaggagttcagggctgcagtga +gctgtgatcacaccactgcactccagcctgggggaagaGCAAGACCctacaaacaaaaaagagttaattccattatattt +atttttacatacccAGAGTTTGACTAAAATATACCAGACAATCTCCTGTCCCCAAATCCATGTCCAAGCAAGGGGAGCCA +CGTTTTCTAAGCTCACAGTTTAAAGGTTAAAGAGACACACTGAGGAAAACTCAGGGAAAGAAGCTGATTTGCATGGACAC +TAGGCCTGTCGTTGATTCCCTCATTTCAAAAGTTGCATGTGCCATGGGAGGCGGAGTCTCTGAGGACATCCTGGCTGTGC +CTCGGCGGCTCTGGACTCCAGCTCTACGCAGAGCGCCTTAACACTGTACTGGAAGAAATGGGAAATCGCATTGAGGACTT +ACAGAAGAATGTCAAGGACTTAATGGTGCAAGCTGGCATTGAAAATTCTATTAAAGAACAAATGGTAAGGTTATTAGCAA +ACTATGTCAACCGTTTTCGTATGTTTTGACTGCTTTCATCTTGAAACATACAATCCCATATTTGTTCACTATGGGAATTG +CTGCCTCTGTGCTGCATGAAAACTGCTCAGGAATAATAAATTTCCAAatgctctcttctttttttgaggtggagtcttgc +tctgtcgcccaggctggagtgcagtggcatgatcttggctcactgcaaactccacctcccgggtttaagccattctcctg +cctcagcctcccaggtagctggaactacagacatgcaccaccacacccagctaatttttgtattttagtacagacggggt +ttcactatgttggccaggctggtctccaactcacaacctcaggtgatccacccaccttggcctcccagtgctgcaattac 
+aggcgtgagccaccacgcccggccacacccagctagtttttgtatttttagtagagacggagtttcaccttattggccag +gctggtctcgaactcctgacctcaggtgatccacccgcctcggcctcccaaagtgcctccTCTGCAGGGCATCTGAGGCC +ACCCAGTGCAACCCGAACCTCATCAGGGCACTGTAGGCATTCTCACAGCTTCTGATGCTGAATGTAAACATGCCACAAAA +GAATGCTAGGATCAGAGGCATGTTTAAAGCCATtagtaaaaataacagaaaaaatgattcagaagccaggcgcggtggct +cagcctgtaatcccagcactttgggaggccgaggtgggcggatcacgaggtcaggagatcgagaccatcctgtgaatggt +gaaagcccgtctctactaaaaatacaaaaaaaaaaaaaaaatagccgggtgtggtggcgggcgcctgtggtcccagctac +ttgggaggctgaggcgggagaatgccgtgaacccgggaggcagagcttgcagtgagccgagactgtgccacggcactcca +gcctgggcgacagagcgagacaacatgaaagaaaagaaaggaaagaaagaaagaaagaaagaaagaggagagagaggaga +gagaggagaggagagagagagagggagggagggagggagggagggagggagggagggagagagagagagaaaggaaagaa +agaaagaaagaaaaagaaagaaagaaagaaagaaagaaagaaagaaagaaagaaagaaagaaagaaagaaagaaaaaaaa +acgatGCAGAATAGCGCCCACGTTGAGCAGGTCCTCCCACCTGTGCCCACGCGGATGCTTCCCCAGGGATCCTCTGAACC +TCCAGGTCAGAAAGCTGGTGAGCCGTGAATTCACcctgcattttaatttttgggTGTGGTCTGTTCAGGTAGGCAGAGAT +GGCAGTGCTCTGATTCTGCTGGTGGACTGAAGTCCTCATCCCTTCTCTGCGTGCGCTCAGAGCAGAGGCAGTCTTCCCCC +AGACACCTAACCAAGTTTCCTGCAGACACACTGGCATCATAACCTTCAGACGTTTATTCTGAAAGTCTTAAAGCAGTATG +TCAGTGGCCCCATACTCTGTAGCCTTCTGCCCCTTGCTTTTTTGTTCAGTACTGAGATTCATGTCGACACGTGTCGCTCT +AGGTCAGTCGCCAACTGCTGCCTGATAGTCCTCTGATGATTTACTTTATATTTGACTAATATTGTTTCCAAACAATGCTG +CACTATTCTTGTCTACTTCTTTGTGTACACGTTGCCAGTTTCTCTAGGGGACTTGGTTATTCTGTACCTAGGGGCTTGTC +ACCTGCATTAGCTGCCACAAGTAATTTCTGGACTTTGCCTAGAAGCCATCCTGGGGAGATGGGAAATAGAATTCACTAGG +TACCCAGAGAGCCACTAGTATATCCAACTTGATTAAAGCACCACAGTGCCACAGACGAGGCAAGGCTTGGCAGCCATGCA +GGCCTAGGTTTCAAATTCTTATCCTCTGTTGCTGGCTGTAACCTGGGGAAATCTAagctttagttttcttctctGCCAAA +TATCTACCTGGTAAGTGGTTGTAAGGATTTAAGAAAATTGCTTGCAAAGCACTTAAGTCAGGGCAAGGTAGACAGCAGGT +ACATAATGAGATAAACTTGAAATACAGAGTCCAAATCATTATGATTTCAACTGTACTGCAGCCACAAAGCACCAGTGAGA +ACGTTATTAAAATCAGGGGTGAACATACTTAATACACCTGATAGGAGTTACGGCCTGAACGTTTATGTCTCCCTAAATAG +CTGAAGACCTAACTGCAGCAtgtctgtatttggagatggggcctctacAGAAGTCATTAAGGTTACATCGGtcctgaggg 
+tggggccctcatccAACAGGATTCGTCTTTCTGAGAAGAGACGCAAGGGGCTCGCCTGCTCTCCCCTCCGTGCCTGCACC +AGAGAAGGAGGCCATCGGCAAGCCAAGGAGAGCCCTCCCCAGAAATCACACGGACCagcaccttgatctgggacttccag +aaaacagaactgTGAAAAGacaagtttctgttgtttaaaccacccaatctgaagtattttgttatggcagtcctgggcag +actaatacaatatgcaACACATTtacaataaatatacaataaacttACAAATGtggaatgtaattattttatctttttct +ggtAACGCCAGTAAAGCTGTTATCCTAAGAGACATAATTCACACATCATCCACCAATTATGCAATTGACAGAATTGTACC +ATTATCACAATCTTAGGGCACTTTTTCTGTTTATGCCTGCTCGGCTTATGTAAGCACATTTTCATtacatcaaaaataaa +actgcCCATTAGCTGTCACCTCctatttccccctccctcccagtCTTAAGCAACCACTAATCCGTCTCTATGGATGTGCC +TATTCTAGAAATtccgtataaatggaatcacacaatatgtagtAACTGGCTTTTCACTTACAGTACTTCATTCtgtttat +gactgaataatattccgttctATGGATGGACATTTTGTTTAaccgttcatctgttgatggacacgtgggtttccacattt +tggcttcTGTGTACGAGTTTCTATGTGGACATATGCGTTCAATTCTCCTGGGTATgaaactgctgggtcaaatggtaaca +ccatgtttatctttttttatttttttgagacagggtcttgctcagttgcccaggctggagtgcagtggcacaaccatggc +ttactgcagccttgacctctggggcttaagtgatcctcctgcctcagcccgccgagtagctgggcccacaggcacgcacc +accatgcccagctaattgttctgtattttttgtagagacagggtctcaccaggttgcccaggctggtcgtgaactcctgg +gctcaagcaatctgcctgtctcagccttccaaagtgctgggattacaggcttgagacacctgcacccggcccatgtttaa +ctttttgacgACCTGCCTGCCTGTGTTACCACTTtgccaccagcagtgtgtgagggctccaatttctccacatccttgca +aaCGCTTGTTGCGATCTTTTTGTATCTAGCCGAACTTGTAGGTGTGAAGTGgcgtctcactgtggttttgatttgcacct +TCCTTAACAAGCAGTGATGCTGAGTGTCTCTTCGTGTGCTTCCTGGCCGTCTGTTTCACCTGCTCTGGAGAAATAGCAGG +AGTGTAATACTGAGTGTCTGTTACATGCCAGGTACTCGAGGTCACATACACGTTCTCCCTAGCCCCTCATTCCGATACGC +TGTTGCACTCTCCTACTTTATACATAGAAAAATCAGCACTTGGGAGATTAAACTGCAGAtttgcatttgaattttaaatc +catttaaaaaGCACACACTCATTTGTACTGTATGTAAAAAATGTGCTTCATTCTGTAAGTACCCTCCCGTTTCCACAACA +GCAATCAATCTCCCGACAGCAGCGAGCCCTGTGCACCACTCCTGTGCCCGGCAGGCACCACGTGCTTCTGCGTCCACATC +TGTGTGCAGGGCAGACACTATTATCCGCCTTTACCAAGGAGGAAACAAAGCTCAGAGAAGGTACCCTGCCCAAGGTCACT +TGGTGGGGCAGAGCCTGGGTCTGAGGTCAGAGGGCCCACGGTGTGCTCAAGTGCCCCCATCCCTGCCCTTCCTCACTTTC +ACTTCAGAGCTCGAGTGTGCTGAGACGGGACACGAAGATCCCAACCAACCATCCACACTGCTAGGATGACAGAGGGCAAG 
+AGAAACCTGAGACTGTAATaaggacattttcttttctttttttgagacggagtcttgctctgttgcccaggctggagtgc +agtggtgtgatctcggctcactgaaacctccacctcccgggttcaagcaattctcctgcctcagccttccaagtagctgg +gattacaggcacgcgccaccatgcccagctaatttttgtatttttagtagagacgggatttcaccatgttggccaggctg +gtctcgaacacctgacttcgtgatctgcccacctcggcctcccaaagtgctgggattacaggtgtgagccaccgtgcctg +gccaataaggaAATTTTCTAACTATATGAACTGAAGGGCCATGGCTTCTACAGAAAATATGTTTACATCAAATACAATCT +tgggaagaataaaaaatagctcTCCTATTCCTTACAGGGAAGgctataaacaatattttattgtactgtttTTATAACCA +GAGTAAACCTTTGGATTCTGTCATGGATTGAATTGTGTAACcgcaaaatttatatattgaagtcctaacccccagcaccT +CAGACTGTGACTTTTGTGGACATAAGATCCttgcaaatgtaattagttaagattaGGCCATGCTGGAGCAGGGTGGGCCC +CAATCCAACACAGCTGGTGTCACTACGAAAAGGGGAAATTTTGGACACCAGCACCCACACAGGGAGGACACAGGTGAAGG +CAGAGATGGGGCGAAGTATCCACACACCAAGGAACTGAAAATACCACCAGCAGCCCCCAGAAGCCAGGGACAGGCCTGAG +CAGATTCCCCTCACGGCCTCAGCAGTGACCCCCCACCCGcgacaccctgatctcagaccCCCGCCTGCAAAGCTGTGAGA +GCACACATCTCTGTTAAAGCCCAGCTCGCGGCATATGCTGTCACCGCAGCCCCAGCAAACCAACCCAGTATGCAAAACGC +ACAGGCTCACAGGATAAACACCTTCGTTTTACTCCAAGGGTAAGAATTAACTTTGACTAGGAAAATCAGTAATCTATTCA +TTAAGTTTCCTGAGAACAAACAAGTAGGCCTGCTCCTCTCACCACGTGCTTGTTTATTTCGGTGAGTTAAGACCATGTTA +TCAATTCCATCTCAGAGGTGCTCCAGCCCTGGAGCTTCCTGTATTTTCCAAAGGCTTTAAATAGCTTAAAACGTTTCCAT +ACAAAAAGGGCTCCACGACATTTATCCGCGCAGACTGAGGGCGCCTCAGTAGCGGTACTTGGTGGAGTAGTCCTTGGGGG +ACACCACCAGGCCGCGGCCTTTGCGGGCCCCGCGGTACACCGTCTCGATGATGTCCACCATCTCCTGCTTGTCCTCCATG +GCCCAGTTAATCTTGTTGTTGTTGCCAGTCCCCAAGTCAATCATGATGTGCTTGTTCCTGCAACGAGAAACAAGGGCATC +CATTCTCATAgaagtctctttaaaaaagaattccttGAAAACAATGCCGTATTTTTCCCACTGTGACAAGCTCTTGTTAA +CATCACTGAAGAACGCATAAAATCGAGAGTGTATGCCATGATGCAGGAGAATACAGGAAGCAgctaaatgtgtttttaaa +gatGCAATGCATTGTCCACCATGCTCTTTCctgcatttttctgtatttatttaggaCAGTTATTCTAAGGGGAAATTAAA +AGGTCAAAGgagggcaggcgcagtggctcatgcctgtaatcccaactctatgggaggctgaggcaggaggatcacttgag +gcccggagtCCGAGATCAACATGGATAACAAAATGAGACCCCCATCTAATTTGTACTAATTTACCTGATAAAAGCAATGT +GTAAGTTTTTCACAGTGCTCTTATGAGCAATGACATAAATTGTATTAAAAGCCCTTCAACTATGAAATGAAGATGACATT 
+TTGCATGTTTATGGTTAGGGAAGTAGATAATTCTTTACTTATCTTTCAAGAGACTAGTTCTTTTCCTCATGAATACTCAT +GTGCACTTTACAAGATaaggatattaaccctttgttgTACTGGCATAAATTTATCCTGGTTTGTTGaagtttctttttga +gatggggtctcactgtcatcTAGGCTAGGGCATAGTGGCAAAATCAACCTCCTggcccaaatgatcctcccacctcagcc +ctctaagtaactgggaccacaggcacgagccaccacacccagataaatttctagttttttgtaaagatggggtgtcccca +tgttggccaggctggtctcaaactcatggcctcaagtgatcctcccacctcggcctcccaaagtgctgggattacaaacg +tgagccactgtgccggcctgattatttttcttagagatgcGAGCTATGTTGCTAAGGCCGGACTCAAACCCCTAGCTCCA +GTCCCACCCCGGCCTGCAGAGTGGCTGGGCTGCAGGCATACCCCACCTAAAGACTGTTTTTTAACGTTCACTTCCACAAG +ATCATGCATTTGGCTTCCACCTACTATTCCGTAGAAAGTATTCCCCGTGAAGTCTCTGGTTCAGGAGAGGCAAATCTGAT +CAGGGTCTGGAGCCCACCTCTGGCCGCCCTCTCATTCTCCATGCTGCAGGCAGGCCCTACATGCAGCATGCACCAAGCTC +CTTCTACAATGACTGGGGGCTGCCCGTAGCACCCTCCCTGCAGCCCTATCTGACCTGGGTGTACTTCTTCGGTATTTATA +TCTCATAAACATCTGCCTCCTTTCACTTCTTCAGAACCCCCTGCTAATTTACTCAGCTGTTTCTATCAGACAACGTCCTA +TGAGAGTGGGAATCCTGTCTTATCTGACATTAGGCTCCACCACTAGACCACAAACCTGGTGGGGGCTCAAGATATATGTT +TGGTGCATAAGTTAACTAAAGTACCGTTTGTCTTAAAATTCTAAATCACAGAAACAGCAGAAGGAAGTTTCATCTCCTTT +TCTGAGTACAGCATCAGTCCTAAAAATATCAAACTACACTAAGTAGGCTGCTCCCGCCACCCAAGGCTGTTGCCTGCAGC +CTCCTTGCCCCTCACAGCATCTCCagccccaggctggcctccacTGCACGCTGTGGGGGGCAGAGACCTAAGACGGCCCT +CCCTGTTCTCACCTGAAGAATTCTGAGGGGCAAAATAGAAAAGGCCTCTATTGCCTCGAACAGGCTGTTAACAGAAATAT +GGACTTCAGGAACACCTGGTGTGGGCTTggaaggaagtgaggagtgtGTTACTGGGAAACGGAGAAAAGGGGACCCCCGT +TACAGAGTGGCCGAGGAACAGGGACACGGGGTCCGCAGTTATAGGAAGTCAGAGCTCAAAAGCAACACACTCGGGATATT +TAGCTGCGGAAACTTCCAAGCAAAGTCTCGGAGGTGCTGCCTTCAAGAGAGGAAAGAGGCAAATTAGTGTAAGAACTGTT +AACAAAAGGAATAAATTCTTTAGCCTCTCCGGATGGCCAAAGGGGCTGAAATTCAGAAATGGCTGCTGAACACACAGTAG +AGAAAAGGATGAGGGTCAGCCACGCAACCTTTGCTGAAACCTAGGAAAGATGAAAAGTTCACAAGAAATGCAGGGTGCAC +CTCACAGATCTTCTCAACCTAACAGGAGGCACCTAGGAAGCCAAGGGCGTGTCCAGGGGAGCGTATCCCGACGCCCAGAA +CCACGGCAGTGCCTCACGCTTTGGcagcctgaggcgggaggatggctcgaagccaggagttcaagcttgctgtgatggtg +ccactgcactccaccctgggtgagagtgagactccatctcaataaataaataaaacaaggtcACTATCCTGGATTATTTG 
+GGCAAGTCTAATCTTATCACAGTCCTCTAAAACCAGAGCATTTTCTCAGGCAGAACTTTTCAGGTCAGAGAACCTGTGTG +AAAAGTTCTCAATGTGCCGGCATGCCAGTGCTGGCTTTGAGGGTAACTGGGTCTCTGGCAGCAGAGTGGCCTCAGGTAAC +AGCAGCAAGGAAGCAGATCTCAGACCCACCACCACGAAGAGCCAGACTCGGCCCACAACCCAAACAAGCCCCTCAAACAC +TCCAGATAAGCACCCAGCCCATGGTTCAGaccttgtgagaccctgagcaggaATCCAGCGGGCCCTGCCTGGATGTCTGA +CTTACAGAACTGTGAGCTCAGAAAAGCGGGGTGATTTAAGCTGCTCAATGCGTGCTAATTTgttacaaagaatacaaaac +taACGCATACCCTATACAACCCACCTGACCTTCCTTACCGAGTAGGAGGCTCCACTTTTGGAGAATGGTCTAATGCTCAT +GTAGCTAATTCACCAACCTCTATGGTAACTAAACAAGTAAAAACAGCATCAGATGGTGATTTCTTTCCCCATTAAATTTC +CATAATGAAATTTTTGAGCTGATAAAATAATCTTGTATTTATAAGCAAAGTAGTTCAGGTGTTCCTCTTGGATATAAAAA +AGGACACCGAAGTTGATgataacatatttaataaattaggtttttgatttttttttttgagacggagttttgctcttgtt +gcctaggctggagtgcaatggcatgatctcggctaactgcaacctccacctcctgggtacaagtgattctcctgcctcag +cctcccagtagctgggattacaggtatgcaccaccacgcccggctaattttgtatttttagtagagactcggtttcacca +tcttagcccagctggtctcgaactcctgaccttgtgatccacccacctcggcctcccaaagtgctgggattacaggcgct +ttgagccactgtgcccggcctaaatttgTGATTTTACAACATTTTTGATGAACAGATTATAGTTCCAGGTATCTAAAATG +TCAGTAACAACATCTTTCATTGAATTACACTTCACCTAGGTTTCCCCGCAAAGTTATTTCAAAAGGGTAATTTGTTACTG +AGTAAATGCTAAAACTAGAATTgttactttgaaaattatttcaaagattcACACTGATTGCCTGGTGATACAAACGATTT +TGGGACATGACTGTACAACCAActtccttcaaaataaaataatctaaagaGATCTTGATTACATTAAGCTTCCAAACTGA +CAATATTCAATTTCAGTAACAACTTTAAGATAACGTACCTGAAGAAAAACATGACAGTACATGGATCGTATAACTCATAC +ATTTTGTTGAAGTCAGGCACTTCTGTAATATCCACAAGATAAATAACTGCAAAATTTTTAACCtaaaaggagattaaaaa +aaaaaactgaaataacagAACACTTTGGCTTTCATTTTATAGAAgcacatttataaaaacaaaaccaagaattTGTGTGG +ATCAgggcaatttttgttttgtttttttgagacagggtctcactctgtcatcctagagtgcaatggtgcaatcagctcac +tgcagcctcgacctcccaggctcaagctatcctcccacctcagcctcccaaagtgctgggattataggagtgagccatca +ctcctggccaACGGGGCAATTTTCAAACTAATTCTTCAACATTTTTTCCCAGTGCAGCAAACACGCGGGAAGATCATGTG +ACACTACTGGGACCTTGGCCACCGGCTCCTTCCGCCTCACGCCACCGTTTCCCAGAATAGTTCAGGAATTGGCCGCTGGC +TCCTTCCGCCTCACTTTACCATCTCCCAGAATCGCCCAGGAATTAAATTTCACTCAGGTAACAAGTTATCAAAGTCCTGC 
+AGAAAACGTCCACAGACATATTATTTATGCTGATCTAGTAATCATCTGTATGAACCTTTACAAATAGTTACATGCATCTC +ACAACTGCTCTAAAATATACCTGTAGAATCCTGCCATCATTTCCCCGCAATTAGGTCTGAACACTTGCAAATATTAATAG +TACCTCCAATAATACtgcatataacacattttctgttattttttctttaattgcaaGTTTATCAGAGATTtagaaaagtc +agaaaaataaaaagaaaataagatcacCTCCAATCCCAACACCTAGGGGTGACTGCTGCTAtcttttttccagagacagg +atctggttctgctgcccaggctggagtgcagtggcaagatcttggctcactgcaacctctgcctcctgggctcaagcgat +cctcccatctccgcctcccaagcagccgggactacaggcacgtaccatcacacctggctaatttttgtattttttgtaga +gacagggttttgccatgttgcccaggctagtcttgaactcctggactcaagtgatacgcctgccttggcctcccaaagtg +ctgcgattacaggcatgagccaccacgcctggcaatatttttactatattctttcccctttcttatagatatatattttt +taatttgttcataTTATTTCTGTTCAGACATAGCAAAATATGGTCATATTGTGATCGTACTACAATATTTTGTATGCTGC +TTTAATTttgagcattcttttttttttcttttttttgagacggagtctcgatctgtcgcctggtctggagtgcagttgca +tgatcttggctcactgcaacctccacctcccaggttcaagctattctccagcctcaaactctcgtgtagctgggattaca +ggtgtgtgccaccacgtctggctaatttttgtatttttagtagagacgaggtttcaccatattggccaggctgatctcga +actcctgacctcaggtgatccacccacctcggcctcccaaagtgctgggattacaggcgtgagccacagcgccggcCAAT +TAAGAGCATTCTTTTACATACATTACCTACTTTCCAAAAATGGCAAACTTCTATGGCTGCGTGATTCATCAAATGGCTAT +GCCATTACTTTAccatttgatatagtttggatatttgtccctgcccaaatctcaggttgaaacGTAACTCCCAGCGTTgc +aggtgaggcctggtgggaggcgactgGATCATGGGATGAATTTCTCATCAATGGTTGAGCgtcatcctcttggtgctgtc +ctcggggtagtgagctgaggttgttCCAAGTGTGTGACACTCCGCCCCAGCTTTCACCGTGCGACGTGCctgttcccctt +tgccttctgccatgattggaaggtcctgaggcctccccagaagcaaatgccagtaccacacttcctgtacagcctgcaga +actgtgagccaattaaacttcttttctcatAAGTTACCCggtctcatgtatttctttatagcaatgcgagaagaGCCTCC +TACACCATTCTTCTGTTACTCTCTGATTGTTCATTAATATTATTCATCTGTGAACATTTTCCCCCTTAAATTTGTATACA +TACTTCATTATTTCCTAAAGACAAATTCCTAAAATTCTGGAACAAAGGATATGACATTTTAAAGGCTCTTTATAAATACT +AACTCAATGCCCTTTAGAAGCTTGTAGCCACTTACATGTGTTATGAGTTAAACTGTGTCCCCAAAACAGTTGAAGTCCTA +AACCCCCAAAGTGACAGCGCAGCTTATTTGGAAACGGTACTGCAGGCGTAATCAGTTAaggtaagatgaggtcatactgg +agtaggtgGGCCCCTAATCTggtgtgactggtgtccttagagGAAAATGGCCACATAAAGACACAGGGACAGGAGGCTCT 
+GGGGGCATCGAGGACTGCAGAGGCGTCTACAAGCCTAGGAGCTCCAAGAGGCTGCTGGCagcaccagaagctgggagaag +cATGAGGACTCTCCTTGGCAGGTTTCAGAGAGAGCCCAGCCTGGAAAACACCTCGATTCTGGACTTCTGGtatccagaat +tgtgagagaagtttctgttgttttaagcccctcAGCTTGCAGCATCCTCTCACAGCAGTCACGGGAAATCAATGGGATAT +GCACaacatttcttcatataaaactaaatCTTTACAACCTGACTGTAACTTACAGGTGAAATAATTATATCTGGGATTTC +CATGAAACACACAGCCTAAAAAAATGAGGGGTTAGGTAACACAAGACCAGCAAAATACCAGCAATTGTTGAAGTGGAACA +ATAGGAACAGACGGGTTCACTATATTCTCTactttgggaaatattttttaaaaattaagatggaaatgtaaaccatggag +cagggtggggagagggggagggggaaagcCCACACTTCACAAAGACTCCACTGCCCTTCAGCTTTGGTGAAAAAAACCGT +ATCAGAACTCCAGCACAGGGCCCACTGCAGGAGCGTGCGCCTCGCCATCCATGTGCTTTCAAAACCCCAAGGTGAGCACA +CGGCCCACTGCAGGAGCATGCAACTCGCCATCCATGTGCTTTCAAAACCCCAAAGTGAGCACACGGCCCACTGCAGGGGC +ATGCGACTCCCCGTCCATGTGCTTTCAAAACCCCAAAGTGATGTTTGCATTAGACATGGGCCCCTCTGAGTCCGGTCACT +CTGACAACGCCCATTTCTGAAACTCTAGTTTCCCGCAGCGTCCACAGTGAGAATGTGTTGGTCAGGTTGCCAGAATCCAG +CTTTATTTTCATGGTAAACTaataaactgtcttttgatttgaCTTAAACACAAAATACTTCCCTACAGAAGCTAAAATAA +ATACACCAACAAAATAAATGGTCACAGGCATTAGGAAagtcaaaataatatttcattattctcCCTTCAATGCCTTGCTC +AGGGACAGATACAATAAATACCAGTAACATCATATTAACTTACGGAATCACCATGCAAATAATAAATTACTTGAAATATC +ATTTGGACAGGGTTGTAAATTGTgtacttaagaaaaaagaaaggtcacAACAAATTTTTACCATCCCTTTTCTTGCAcgt +aaaaaatagtaaatattataaGCATCTGTGGTTTTTAATAAtttgtaggcatttaataaacagAGTgcagggccaggtga +ggtggctcacgcctgtaatcccagcactttgggaggctgaggcgggcggatcacttgaggtcaggagttcgagaccaacc +tggccaacgtggtgcaaccctgtctctactaaaaacacaaaaattagctgggtgtggcggtgcacgcctgtaatcccagc +tacttgggaggctgaggcaggagaatcacctgaacccaggatgtggagattgcagtgagctgagaccgtgccactgcact +ccagcctgaaggacaaagcgagactccttctcaaactaacaacaacaacaataatcaGAGGGCAACTGAGTAACTGTAAC +TTTAAAGATACAAATGTGACTCTCCTTTTGCCTTCAAGCTCCGGCCCACTGCATCTAACCCGGAGAATTCTAACAGAGGC +AAACTACCACATAAAGCTCACGTGCCACGAGAACGTGAACATCCTCTGATTTCACCACAGACCAGTTTCATGAACACTTG +AGAATCCACCATGTATGAGGCACTTTTCACTCATGGATTCTAATTCTTTCAACACCTCTAAATAGATGTCGCCGATGTTT +TAGGTGATGGAATTCAGGCTCAGAAAGGTCACGTGAGTGATGCCCAGCCGCACGGGGAAGGGGCAGAGCTGAGATGCAct 
+ctgtctggctccaaagcctgaACTCTGGATGACAGAATCAGCCTAACCCACAGGAGACGGGGCGGCAGTGTGGTCACGGG +GTGGCAGGGAAATATCTGTCCTTAGAGAATTCCATGTGTAAAGAGAAAGACACAAGACCAGCAGCCAGGTCTCAGATCCA +GCACCAGGGCAAGGAAGCAGAAGTGCAGGACAGTGGGGACCGATGCGCAGGTGGACTGGCCCACAGAGGACCTTTTCCAC +TTGGGATTCTGGGCCATGTATTTATTTCTAAGAAAGCAGGAATTGAAGAGTTCAGTTTTGTATATACTGAGTTTGAGAAA +ACTAAGGGACATCCAGAGAAAAAGGCCCTCCAGGTAGCGACTGGCCTGGAGCTGTACAATGGAGTCTGGGCTGGAGAGAG +CTACCTTACTGCCGCCACCTACATCCAGCAGTGATCCCAGCCAGGGTGGCAGGTGCCAGGCTTGGGAGTGAGTGCAGAAG +CCCACTGTAGAGTTGGGGGCAGGAAAGGGATGCCAAGGAGGCAGTCTCAGAAACCAAGGCAGGAGACTCCGGGGAGCTGT +GCCGAGTGCCCCAGAGAGGTTACCAGAAAGGACGATATGGGAACGGGGACACTGCAGGGGCTGAAGGACTGAGGAACAGA +GGACTTCAGTGTGGACAACTTTCAAAAGCCGTGGCtatggagagagagggaagttACTGTGGGACTTGGGAGTTCAGATA +GGAGAGGCTgaaagaggcaggcagaggggagctccccagcctcccaccacCCCATTTTCTTGGTGAAATGGCATCTGTGG +GGTGACTGGAGGAGCATGAGCTGAGGGCAAACATCAGCAAACATCCAGCAGCATCTGCTGGTGGCGACATTTCCACGGTA +ACCAAGCGCTTGGAGGAGGAGCTGGGTGAGTGCAGCCACACTGCTCTGCCAAGTCATCCTGTGACTTCCTCAACAAaggc +agctttttcttttcttttttagatgggatctcactttgtcacccaggctggagcagagtggcgtgatctcggctcactgc +aacctctgtttcccagcctcaagagatcctcccacctcagcctcccgagtagctgggattacaagcgcccaccaccacat +ctggctaagttttgtatttttagtagagatggggtttcaacatgttggccaagctggtctcgaactcttgacctcagatg +attcgcctgcctcggcttcccaaactgctggaattacaggtttaAGCCACGGCGCTGGcctcaaattgtttttaaattct +ttgattgCCTTGAATTGTAAGTATCTTTTCCAACTCTGTTGCCTATACACCTCATAAGCATGCTTTTTCTTTAGGCAAGT +CAGCAATTAAAGTGCTCAGCAGTTCTTCCGCATGCCTCTAAGTCCACCTGCCAAGTTACAATGACATAACTCTGCAAAAC +ACATGACCAAACTGGCACATGATACCAACCACTGTTCTACACTTTTGTATGGAGTCCCTCAACCAGCTAAGAAAACACCT +AAAATTCCGGAAAAAGTGTTCTGGGCAGTTGATTAGCACAACAAATTtttgaagtgtttatttttacatttttaggtaAT +TCATGTCAAAGCcataaaaaaaactgaaaaacttgaCTTTCTAAAATTAATGTTTGCCTTAATCAGCTCTCACAATAACT +TGCAAACTTAGTAATGTGAGAGAAGTAATCACAGTAATAACAGTGGCATTACTGCCCATGTGACAGGAGTGCTCTAGAAA +CCTTTATTAATCCCACCACAACCCTATGAGTGAGGCACAAGTATGCCCTTTTTACAGGTAAGATGACCCACGGGGCCAAG +TTCGTAACGCTCATGCAGTTATGAAGTGCCAAGAGCCTAGCTAGAGTCTGCACCCTCAACCGCCGCTCAACACCACTCCT 
+CATACCTTGTGGCTAGAGTCTGCACCCTCAACCGCCACTCAACACCACCCCTCATACCTTGTGGCTAGAGTCTGCACCCT +CAACCACGGCTCAACATCACCCCTCATACCTTGTGGCTAGCTACAAGGctgttttttttccagagacaaaaATCCAGAGG +AAACACCAAAAATATCCATTCTGTTTGTTTCTACAAGATTCCTCATCTAGCTGAGTGTTGACTCTCACTGGTTCAGTAGG +CGTTCTGTGATCCTTAATGTGATACTTAAGATACACCCCTTAATATGGCTCCTATGAAgttccataaatatatatctgCC +TTACACACAATTTATTAGAATGTACTatcatgggccaggcatggtggctcacacttgtaatcccagaattttgggaggct +gaggtgactggctcacttgaggacaggagtttgagaccagccggccaatatggcaaaaccacatctctaccaaaaataca +aaaattagcagggtgtggtggtggtgcaggcctgtagtcccagctactcaggaggctgaggttgcagtgagccaagatcg +tgccactgcactctagcctggacaacacaagaagactcttgtctcaaacaaacaaaaaaaatgtactATCATGAGTCAAT +AATATTACtgaggaatttttttgttttgtttttagagatggggtctcgctctatcacccagggtggagtgcaacggcacg +atgatagcttactgcagccttgacctcctgagctcatactctgtcttagcctcctgagtagtggggttacaggcacatgc +tacaaTGCCCAGCTCACTGAGACTATTTTAAACTCTATTTGCGTGGGCtttttaacacaaaaaataaacgTCCCATTCAG +TCTTCTCACCCATCATGGGATTTAGGTAACAGTGAACCCCAAGCACTGCTGTGAGTGTTCCTACAAAATGGAAAGACAGC +TCAAAGGCACATGAAATTGTCCCCTCTGAGAGCGTTCCTACAAAATAGAAAGACAGCTCAAAGGCACATGAAATTGTCTC +CTGTGACGGCGTTCCCACAAAATGGAAAGACAGCTCAAAGGTCCATGAAATTGTCTCCTGTGACGGTGTTCCTACAAAAT +GGAAAGATAGCTCAAAGGTACATGAAATTGTCTCCTGTGATGGTGTTCCTACAAAATGGAAAGACAGCTCAAAGGTCCAT +GAAATTGTCTCCTGTGACAGTGTTCCTACAAAATGGAAAGACAGCTCGTACATGAAATTGTCTCGCTTCAAAACTGCATA +AAATCAGCTGCtaaatagaaaactttttttttttgagacagggtctcgctctgtcacccaggctggcatgcagtggcgcg +atcacggcttgtcacagccttgaactcctggactcatatcctcccacctcggcctaccaagtagttgggaccacaggcac +atgccacaacgcctagctaatgtttgcattttttgtacagacaaggtttcaccatgttgcccgggctgttctcgaactcc +tgggctcaagggaaccacccacctcggcctcccaaaatgctgggaccacaggcgtgagacACAGTGCCCAGatgaaaact +ttatttattattatcactTTATCAAATGAATTGCCCCACATTTATTAGCAAACTCTAGGCATCTGTGAAATCTTTTAAGC +ATAGCTCTGTTTCCAAACTGACTTTATTCCTGAAGTTTACAGCTCAACTCTAATCCAGCATGAAGTCAAGTGGGCCCTGT +GGAAGCAGCAAATCAATACAACACATCTAGGAAAAATCATATCAAGTCCAGCCTGAACATTTGCAACTGTTACTCACTTT +AAAACAGCTGGAATATCCAGACAGAACTCGGAACAAGGTAAGGAAGAGTCCCCATTATCTGCCATTTCTGCacagggaaa 
+tttttaaatagttatattGTTCTTTATACTAAAAccttatatttcaaaaatttctgtTGAGCTTCTCCAAAAAAGGGCTT +CCAGAATTAGGAAGTCATATTTGCCCATagactaagaagaaaataaaacatgaatattCTTTTAATGCATTTCAggtgtg +cactttttttttttttggagacagagtctcactcctgttgcccaggctggagttcagtggtgtaatcatggctcactgca +gtcttgacttcccgggctcaggtgattctcccacctcagcctcccaagtagctgggactacaggcacgtgccaccacgcc +cggcatttgtacttttagtagagaaggggtttcaccatattgtccaggctggtctcgaatccctgggctcaagtgatcgc +ctacgctggcctcccaaagtgctgggattacagacatgaactaccgcgcctggcctaggttTGCCCTTTTAATAACTATA +TTATAATATTCGCAttaaaagtttattaataaaGCTATCCAAATTTttggacaaaaacaaaaatcagactattttaaaag +accaaaaaaaatacCCCAAAGTGAGCCCATTTCAACTGTCCTTAAGTGTGTGTCCACTATGTGTGTCCACTAGTGATACA +GTGATTCATTCTGTAATAACAAAGAAATTGGACATAAAGGAGTTGGAAAAGAGCAAATCTGGCAAAGACCCTAAACACTT +ACATTAACACATAAAATCTGTTTTCTAGCTAAGAGCATCTTGCTGAGCTGTGCTCGGATGTGATGAGACAAAGGCCCTCA +CATCTCCAGCCATCCTAGCCTCGAGCTGCCGGCAGCAGGATCTGAACACCACCCGATGTTTATGCCACCACCTGCAATGT +AGAACAGTGAAAGATTCTAGAATTCATAACTCAAAGTCAATACAGCAGTGATTTACTTATTGGGAAACTGCTCTTAGTCA +ATTCTTCGCTGGATTTCCAGTATCTGCCCTTGTCCACTACAGCTCAAGCAATGACAAACTGATGTGTGGCACTGCCTCTC +TCATCATCTCTAATCCCCGGCTGGCAGAACCCCCAGCGTGTGAGAGAACTGCTGCTCTCCCGAATTCCCTAGGTCCCTCC +TCAAGTCctcagtgtccaggaggaagaggctgcgAGATGGCTGGTACTAAAGCTACTCCACCTCCATCCCTGACCCCCAT +GGAAGGTGGTTCAAGGTCAAGTCGCATCAGAATTCCCTGAGGGGCTTCTTTAAAAAGCCATTGCTGGACTTCACCCCATA +CCCATAGAATCAGAATCACAGATGCATTCTAGTAAGCACTCCAAGGGAGTCTTGTAATAACTAAAATATAACAACAGTTC +CTAACTTCATGCCTCTCCTACTAAATaagaaatagcaaaaattaataaatggcaGGCCAGATAGTCTTTCCTTTCCTGAA +TATAAAATATGTCCTTACATCTGTTACTCTTTACTCAAAGGAGTTATTTTAACCCAGGCCTACTTTTTAGCATCTGTGAA +AGAAAAGATAGCTTCAATGTTACTTTTACTCTTCATTAAAAGGAATAACCTCTCAAATTACTTATGGAAAATTAATatgg +caaatattttattaacaccCTAGTATTTGTGCCCTTTGCCCTATTGCTACAGCTGCAAAACCTCAATAAAAGTCATCCTA +AAATTGCTATGAGGAACcacagaaaaagaatgaaatctggTTATAAAATAGTGTCCTACAACTTTTATTGTCAGCAAGTA +AATGCCAAGACCACTCAGTGTACTAATATTAGAAATTCAGAGAACTAATTCTAACAAAgcatatttctattttctcctgA +TGTTTACTGCACCTCAATCACTGTAAGCTGTAAACCACATGACTCACAAAAAGCGTGGTGGGATAAAAAAACACACCCAC 
+AGAAAAGCAGGATTTCTCATGCTATGAACATGAAAGTCCTGCTTTATGCATACAACGAATAACTCAGGTACCAAAAAAAG +ACGTCCTTTTAAATTAAGGATCCGCGCTCTCAATAAAACCTGCAAGTGCATTTTGTTCACTGAACTATAAGTTTGGGAAA +AACGCCTTATCAACAGAAATAATTAATTCATTTGAGAGGAGGGGGATCATAAAccctttaaaatatgaaaaactacAAGA +GCTCCCCGCAGAAAAAGGCGTGAAACGCCGGCACAGTCCCCGAGAGCGCTCGACCGCAGCGAGGGGAGGAATCGCCTGAA +CGACGGAGCCGCGGCCCCTCCTCGGGGAACAGCTCGCGCCCCCAGGCGACgccggcagggcagaggcgcggGGCAGATGC +GGAAGCACAGCCCGCAGAGCGGGAGAGTCCGGCGCGCTACCTTCTCGGCGATGCTGTACAGGACCTCGTCCATCTTCATG +CACGTAGGATCCCAGTCGTGGCCGAAGCGGATGACGACCACGCGGTCCTCCTCCGAGAGGATGGCCTGGTCCACCTGCCA +GCCGTTGTGCAGGTGCGGGAGCATGTACGACATGGCGGCCCGCGCGCTCGCCGCCGCCCAAGGCGGGGCGCCAGGGAGGG +CCCAGCGAGGTGGGCTCAGCCGGCCCCTCACTCCCCGGCCCCCGCCGCCCCCGGGCCCACGGACGAAATCCGGTCCCGCC +CGCACACGCAAACTCCGCTGGGACTGCCACCCGGCAGAACGTCTGGGCGCGCACGCACCGACGCCGTGCGTGCTGACGGC +ATGCGCGCGCGCTAGCGCCGTGCGTGCTGACGGCATGTGCGTATAGGCGCCGCGCGAACGTGTAGTTGGCCGGGTGGAGC +GGCGGCGACCTGGGCACCGTGGAGCGCCGTTGGTTCCGTCATAGCAGCTGTCGGAGTGGGGGTTCCTCCCCAGCGTCCAG +GCGGCCTGGTGGTCCTGAGAAGCCCCGGGCTCGCCGTGCCCTGCCCCCACGCTCCCGCCCCGAGGCCGCCCGCCGCCGCC +CGGGCTGTCCTCCAGCCACGGATGGGGACGTCCAGAAAGGCCCGGAATGCCCGGCACTGCGGCtcgttttcttcctttct +ggtgCTTGTTTCTGTGTGTTACTAAGACAGTTCATGTGTGTCATTTGTGAAACTTGATCATAAAAATGTGTCGTTGTTGC +CATACCCAACCACAGCAGAGTCGAGCAGCGGGGTAGGGAGGAAGCACTGGGCAGGCCGTCCTTGAGGAATGTCACCATCA +GGCCGGCTGCTGAGCTGCCTGTTGTAACCTGAAACCaggttttttaatttgtctttttataaagaaaaaaaatttttttt +tgagacgtgcagtgacgtgatcttggcccactgcagcctccgcctcccggggtcaagcgcttctcctgcctccgcctccc +cataagctgggactacaaggcgccAGCCCGGCTAGTGTTTTTTTGtacttagtagagacggggtttaaccatgttggcca +ggctggtcttgaactcctgacctcaagtgatctgcccgcctcggcctcccaaagcgctgaaaTTACAGGAagaagccacc +gcgcccgacctaaAGCCAGTTTTATTCCATAGCTTCAGCATAACTTCCACCTCCAGGACTGATCTGGCCCCCTGGCTTCG +CTCACCAGTCAGAGCTCCCCAAACCCTTACTAGTGCCAGTGAACTTTCTCAAAGAGAAATAagttaatatttctcttttt +aaaataaaacctctaACCTCTGTTCTTCTGAGAGAGCACTTTGGGTTCATGCTGGAGACTCCCTGGTTTGCAAACTGATA +TTGATAGTAAAACTCTTGTCTACTATCTAGCTATCCTGGTGGTCCTTTAGatgacaaatttaaaacaaatcacgcccggg 
+cgcggtggctcacgcctgtaatcccagcactttgggaggccgaggccggcggatcgcctatggtcgggagttcaagacca +gtctggccaacatggtgaaaccccgtctctactaaaaatacaaaaaattacccgggcatggtggcgcccgcctataatcc +cagctactagggaggctgaggcaggagaatcgcttgaacccagggggtggaggttgcagtgagctgagatcgcaccactt +cactccagcctgggtgacagagtgagactccatctcaaaaacaacaacaaatcactATGCAGATGCTGTGGTGTCTCTCA +GATGGGCTGTGGCATGGTGAGAACTACAGTCTGAAGAGCCTGCTGGCATGGGGATTGAAATGGGTTCCATGTTTGGTTTC +CTCTGGGACACAGCACCATAAGGTGCTTTGATAATATGCACCCACAATTTAAATACTAGAATCACaagtgcagctcactg +ccagaTTAAGCTTTTTACATAATACAGAATATTTCCCAATTATATACGAAATGCTTTATTGTAATGAAACAATTTCAGGT +TGGTAGGATATTAGAGATCATCTAaatctctttcattttataattgagtAAAGCAGAGGCACAGAATATAAGCATAGTGG +TTAACAGTGAGGGTTTTAGTTACATAAAGGCGTGTTCAGGTTCATCTGTGCCTTGCATAAGTTATCTCAacaaacttcag +tttcctcatccatgagATGGGATGATAAAACAAAATCTCCTGTACTTCATTGGCACCAGGTACACACTGGAGAAATCTGA +GCCCTTACTAAGGTCAGGCTATTGCCTTCTAAAAAATTCCTTACCGTCATTTTGGTAGTCACTCATTTCAGATACAAACA +CAGTTGCTTAACACATCACAAGCATTTTATTCACAGGAGAGTTGTTCCAAGCTAAGGCATAAGACTCAAATGTCcccaaa +taatttattgaaactcattttttattgtgttactTATAACAAGATTTATCGCCTTAATTTTTAaggacattaagtacatt +catattatgcaaccatcaccactatccatctccagataaatcttttctttaataaaaacttttggccgggcgcggtggct +cacgcctgtaatcccagcactttgggaggccgaggcgggcggatcacgaggtcaggagatcgagaccatcctggctaaca +cggtgaaaccccgtctctactaaaaatacaaaaaaaaaattagccgggcgcggtggcgggcgcctgtagtcccagctact +cgggaggctgaggcaggagaatggcgtgaacccgggaggcggagcttgcagtgagccgagattgtaccactgcactccag +cctgggcgacagagccagactccgtctcaaaaaaaaaaaaaaaaaaaaacttttgttgaGATCTAATCcatatactataa +aattcaccaaTTGAAGGTGAGATGGTTTTTCAGTGTTGTGGAACCAataccacaatctaattttaggaGAATGTTATCAC +CCCGAAAAGAAAACCTATACCCATTAACAGTCATTTCCCATTTCCCTCCCattccctcagcccctggcaatcactaatct +attttctgcctctatagatttacctattctagacatttcatatagaTGGAAACACAAAATATGGAGTCGTTTGTGACtgc +tttcatttaacataatgttctcaaggtccATATTGTAGCAGACATCAACAGCTAGTTCTTTTTAATTGCTGAtgaatatt +tcattaaataaaacaaaaaattagaaactattccattatatggatatactacattttatccatcagttgattgacatttg +ggttgtttgcatttttttactattatgagtaatgctgctatgaacattcatgtacaagcttTTGTGTGGACTTAAGTTTT 
+CATGTGGTAactgtttaacatttttaagtattgctaaattgttttccaaattggctacaccattttacattctcaccagg +aATGTGTGAGGGTTtcagtttttccacattcttgccCATGCTTGTTATTTATGTACCTCGTGAACcccaaaaatctgaga +caggtctcaattaatttagaaagtttatttttccaaggttgaggatgcatcCATGACActgcctcaggaggtcctgatga +taTGTGCCcgaggtggtcggggcacagcttagttttatacattttagggaggcatgagacatcaatcaatatatgtaaga +agaacattggttcagtctggaaaggcgggacaacttgaagcaaggACAGGAAGACTCAAAGCGggaagggggcttccagg +tcacagatatgTGAGAGATGAACAGTTGCATTCTTGTGAGTTTCTGATcagcctttccaaaggagacaGTCAGCTGTGCA +TCtgtctcagtgagcagagggatgactgactagaatgggaggcaggttgaaGGGACCCAAGATACTTTCCTTTCACAGCC +TTGTATTATGGTCATACTGGTAGGTATAAAATGATAAGgtggttttcatttgccttttccTAATGGTTAGTGACTGTTCC +TCAGTGGCCAGGCCTATACAAATCTGCCCCAAAGTCAGAGGAAGCTGAGACGCCAAAGAAAGGGACTGGCAAATCTAGTT +CTTTAGagagaaacatttaatagggacttatGTACAGAAGCCACATCTGTGTCTCTGGCAGTGGTGAGACAAGGTGGTGG +TTTCCCAACCATCacccccaggcccagggccttcccagacccagggcttacatactgtgaaaggaaaataaatcttgtga +ccccaaactcattaagccaaagggaaaagttaagctgggaactgggtcaggcaaacctgcctccccttttggttcctaaa +taagagGGCTGCAAGATGATAACCTACACGCCTCCCCcatattttgcccacaaggaaattcctggtGAGCTCCAAGATTT +CTGCCCTAAGGTGTTCCTGTTAAAATTTCGCCATGGTAATGTAAATCGATAGCTTGTCTTTACAGGTGCAGTCATCCCCC +TGCCTACCAGatacaaatgcatatctgattgttcccctgcccttttgttatgttttgtctaTGTTACCTTATGTAATAAT +GCAGATTCCTCTGCCCCATTTGTCTGTCatcttatgtaaaaaaaaaaaaaaaaaaaaaaaaatgcagattcactgaacCA +AAGGCATGAATGACTATTTTTCCCTACCCTCCTCTTATGTGAAATTTGTGTACTTCTCAatatcccaccctttccccttt +aaatttggagccctcaaaataatctttggagaaaggcGTAGACCTGTCTCCCAGGCACGAGTCCTTATCTTTGGCAGATA +AGCCTCCTGGAATGAGACTGGTCTCATCATTTTTTTCGACTGACATGTGGTAACCACTAAGGGAACCTGGGCGAAGGTGG +CCCAGCCTGCAGCAGCTCTCCTATTGGTGCTTGGTACTGGTCTGGGCACCTTGTAGCCTAAACTGGTAGGACAATTTGCT +GAAGTCTCGGACCTCTTTTTTCCAGGGATCCCTGATCTTCCAGTGTTTTTCAGTTGGGGGTCAGAGGTTTatttgctgtt +taaaaaaaaaaaaatctccttttttcctggaatttccaCTCACTTCCATCAAGGAAGGCAGCCTGTCTGCTGCTGCATCT +GTGGAGAGCACTTTTCAGCTTGGGCTCCTATCACTAGGTAAGGAATTGGTTTGGGATTTTGTCttgcaaattctttttaa +atgactaaagtTAGCATGAACAACCCACTGGTGTTAATTTCTGCTTACACTTGAGTGCTCAGAAGTCATATAATTTGCGT 
+GATCACTGTTAGTTTTGCTTAGCTGTTTTGTTGTTTCCGTCTTGTTGGGTTGTGTGTgttgggttgtgtgtgtgtttcag +tccTTTCAGCTACCGGATTTGACCAACTCCAAACCCTCTAGCTTATGAGTGTGgaattttctaaagaaataagAGCACTT +TACTCCCCTCAGCCTTTCGGGGCATTCTTAGGCAACTGAGAATCGCGTGAGGGTGTCTGGGAGGAAAGCACCTTAAGACG +TGCAACAGCTCTGAGCTGGTTTTCTCCCCAGAAAAACATGCTTAAGGTCTAATCTCATCTGGTAGGTGCATATAAGGAGC +TGACCCTTCCCACACCTGGAGCCCCTGACACACTTTACCAGGTAGCCGCGACAAGCGTGGACTGAACTGGTTCAAGGAAT +AATGGCCCTGAAGAGCCAGGTCCCCAGGCAGCACACTTTGGGTCTCACACACATCCCAACTTGGTTGAATCCGAAGGGGA +ACTTTAAATTATGGGGAATGAGGCCTCTAAATTAGCTAAAACCCCAGCAGCTGAGGAACATAAAGTTCCACCTTTAGAAA +CTCCGGCCAGGTACATGCAAAATACTTACGGCAAATTGTCATGCAAATATTTAACCAAGTGGACCACTATAACCAAAGTA +GTTCTAAGTTACAATGGCCTAAATGGGGATCTTTTGAGATGCCCAATTTAGTGTACCCGCGAACCAGAATGGAAAACGCG +GGCACAAAAACTAAGAAACCGGAATGGGAGAGCTACTTTCTTGGTACCTGGAGAGTAGCCAGCGGGGGGAAAGATCACCT +CACCTCCCTACAGGAAGCCGACAACTTAGAACTGCCAATCGAGAACTCTCTAATCATTTATCTCTCTTAAAGAAATCCTC +CCAGGGGAAAGATCACCTCATCTCCCTATAGGAGGCCAACAAACAACTCAGAACTGCCAATCGAGAACTCTCTAATCGTT +TTATCTCTcttaaagcaatcctcccagggGAAAGATCACCTCATCTCCCTACAGGAGGCCAACAAACAACTCAGAACTGC +CAATCGAGAACTCTCTAATCGTTTATCTCTCTTAAAGAAATCCTCCCAGGGGAAAGATCACCTCATCTCCCTACAGGAGG +CCAACAAACAACTCAGAACTGCCAATCGAGAACTCTCTAATCGTTTCTCTCTCTTAAAGAAATCCTCCAAATCCCTTACT +TCCCCCTTAGTGCCTCCCCTACCTCCACCTACACCCCCACTCTACCCCTAACCCTCTGGACTTCCTGGACCAGACCTCTC +CCGTCCTTTTCCTTCAGGTTCCAATCTACCCTATTCTCCTCCTTGCCTGCCACTCACTGAACCGAAGGCAGTAGGGAATC +TAACTTCCAAGACCCTACTTCCTGCCGATTCTCTGGTATCGCCAGTGGCAGCCTCTCATCTGGAAGACATGGAAAAGGGG +ACACCATAGGGACTCCTCTCATGATCAACCCGTTTCGGGAACAACTGTTAACAGTTAGGGGAACCCCTGTGATTGTCTGT +CAACCCTGGTTAATGGCTGAGTTGCGAGGCATAAAGAATTTCCTGCCTCCCGTAAAGATCCAATTGGGTTTGCTCAAGAG +TTTGAGCTCATTATCAGAATCTATGACCCGGGTCATTCAGACCTTTATCAGCTGGTCCACATACTGGTCTCAGAAGCTAA +AGCTAAGTGGCTGGAAAAGGCACAGTGGTCAGACATTGTAGCAGACCTGATCTCTAAAGGCCCAGGAAGGCCAAATCAGC +CAGCCCCAAATCCTGAAGACAGACGCAGAGATGCTCGTGAACGATGACTGTTCTGTTAAATAGCATTCCTTCAGTGTCCC +GGAGGGTCGTGGATTGGAGGAAAATCCAGCAATGCTGCCAGAGCACAAAGGAATCCGTTTTAGATTATTTCACACGTTTT 
+GATAAAACTTTTAGACAATATTGCAGGATGTCAGCTGATTgctatgaaaacaataaaaatgctaCAATATTAAATGCAAA +TTTATTAGACTAGGTGATGATTTAGCCACCCTTAGAAAATGCCACATGATAAATTGGGCCACAGCCAGAACTAATGAACT +AGTTAGCTGACCAGTTATCCCGCACtgtgataaaaaggaaaaacagaagattGCCCAAGTTATGCGTTTACAGTTAAAGC +AATTAACTTTTCAAACCTCTCAGCCCCAGAAAGATTTTAAGCTCATAGGTCTGAGGACTCTTCCCTCCCAGTCTGTTACT +AGTGTAAAAGACCAGGacaacggccgggcgcggtggctcacgcctgtaatcccagaactttgggaggctgaggcgggcag +atcatgaggtcaggagatcgagaccatcttggctaacatggtgaaaccccgtctctactaaaaatgcaaaaaaaattagc +cgggcatggtggcgggcgcctgtagtcccagctacttgggaggctgaggcaggagaatggcgtgaacctgggaggcggag +cttgcagtgagcccacatcgcgccactgcactccagcctgggcgacagagcaagactctgactcaaaaaaaaaaaaaaaa +aaaaaaaacggccaggACAGCTTAAGAGGGATTGCAttaggctgaaatgaaagaaaagacaagaaaatgcaGCTCAGGAA +GACTAGGGTGCTCTGAGAAAGTACAGGGTTTCACTGCTCCAAATATTCTACCCTGACAAATTGGGAGAGATTGATATAAT +GATAAACCAGGAGCTTACAACTGCCTGAATTTGACACAGGAGTGGCTATATCTGTTATAAATCCCACCTAATTTAGAAAC +CCCATACGTGAGAGTAATGAAAGAATTAACATGGTGGCTGTGTCTAATAAAACTCTCTCATGTTTTAAGTCTAAACCTAA +ACCTTAACATTTCTTTGGGTTCAGATGCCCCGCCATGGGCTCTAAGTGTGACATGCTCCGTTGGTCCCATGTGTTTCTAA +TGTGCCCTGCAGCCCCGTCAATCTTTTGGGCTGTGATCTCGTCAACATCCATAATGCTCATATCTCTTTTTCATCGAAAG +GTGAGCTTCTTCTAGAATTGAAGCCAGGGACCAACAATATCAACTTAGAAAACATCCTGACAGCAGGAtgcagtcactca +tgcctgtaatcccagcactttgggaggccgaggcaggtggctcacttgaagttaggagttagagatcagccaggccaaca +tggtgaaacctcgtctctagtaaaaatacaaaaattagccaggcgtggtagcgcatgcctgtaatcccagctacttggga +ggctgaggcaggaaaatcacttgaacccaggaggcggaggttgcagtgagccaagactgtgccactgcactccagcctgg +gcgacagagggagactccatctcacaaagaaaaaacaagaccTGACATTGCACCACAATTTAGTACTAGTAATGTTAAGA +CACCATTTTGTGACTGGGGAAGGGAaatgagacagagaaggaaatactagaagagaagagagaacattGGAATAAAGAGC +AGAAAACAGTAAAACTCCTTTTAGCTTCCCCAGTCTTCCTGTTAACTCCAGAAGTGGAGCATTTGCTTAAGGATGTCCCT +TCCCACTTATACTCTCAGTCAAATACAGATATAGGGAAAATATTCTCAGCCACTTCCATAAAGGTAGAGATAAACCTGAA +GAACCCCCTACCCAACCTCAAACAGTATCCTCTCTGACAGGAAGCTACAGATGGAATCGCCCCTGTCATCCAAGATTAAC +TGAAAAGGGGGCTCAACTATTCCCTGCACAAGCCCCTGCAACAGCCCCATATTccctgtaaaacaaaaacaaaaacaaaa 
+aaaaacaagcaggagGGGACGGAGATTTGTACATGACTTGAGGGCAATAAATATGGCAATACCCAGACACCCAATAATCC +CCAACCCACATATCCTTCTATCAACTATACCCAGTACCAGCTAGTATTTCTCAGTTGCAGATCTCTGCAGTGCCTTCTTT +AGTATTCCTGTAGATCCAGACAGCCGGTATTTGTTTACCTTTACTTGGAAAGAAGGGCAATATATGTGGACGGTAATGCC +TCAAGGGTATACACAAAGTCCCACTTACTTTTCccaaatattaaaagctgattTAGAGGATTTAATTTTTCCCCAGGGCT +CAACACTCATTCAGTATGTAGGTGACCTTCTTTGTTCAGACACACTATCTTCCTCCCAGGAAGATGGTCTATATTTACTC +AAACagccaccaaaggacacaaagtgTCCAAAGACAAACTTCAGCGATGCTTGCTGCAAGTTAAGCATTTGGAGGCATAT +TATCTCAGTCAAAGGACTGAGTATTAACCCTGACAGAGTGAGAGGAATTTTAGCTTTCCCAATGCCTGTTACTAATAAAC +AGCTTAGAGGATTTTGGGTCCTGGCTGGCTATTGTAGAAACTGGATACCAGTGGACAGGcccggtggctcaggcctgtaa +tcccagcactttgggaggctgaggcgggtggatcacgaggtcaggagatcgagaccattctggctaacacggtgaaaccc +cctctgtactaaaaatacaaaaaaattagctgggcgtggtggcagcacctgtagtcccagctactctggaggctgaggca +ggagaatggtgtggacctgggaggcagagcttgcagtgagccgagattgcaccactgcactccagactgaacgacagagc +gagactccgtctcaaaaaaaaaaaaaagaaagaaactggataCCAAATTTCTTCCTTATGGCTCAACCTCTCTATGCATA +CCTAAAAAATGAACAACCTGATCCTGCCTTGTGGAGTCCTGAGGGACAATCAGCTGTACAACAAATAAAGGAAATTCTAA +CTAATGCCCCAGCCTTAGGGCACCCAAACTACAAACTGCCTTTCTCCCTTTTCACACACAAAACTGGAGGTACTGCATCC +AGGGTACTGATCCAGAAACATGGTGATCATCAGAGGCCTATAGGCTATTTTAGCCAACACCTGGACCCGGTGGCTTGAGG +GCTGCCTCCTTGTGTGAGAGCAGTAGCAACCATGGCCCTTCTGTACAAGTCTGTTGAAGAAATAAGTATGGGTTCCCCCC +TTACCATTTCTGTGCCACATTCTCCTGAGACCCTTCTAAACTCTCATCATACTCAACGTGTGTCTGTCAACCGGTCAGCC +TCTTATCAAATTTTGCTTGTACCATCTTCCAATATTACTACTTCCAGTATAATAATCTTAATCTGGCCACTCTCTTGTCA +GGCCCTTCTGACAAGACCCCTCATGACTGTGTTCTGATGACTGACTTCTCACCCCAGGACAGACCTACAAGAGATGCCAC +TGGATCATGCTAAAATAGAATGGTATACGGATGGGTCTTATTTAAGAGGAGAGGATGGAAATTTTGGAGCAGGATATGCT +GTGGTTTCCTTACTAGAGGTAATTAAAGCCGGTCCTCTTCCCGAAGCCAGATCATCTCAAGTGGCTGAGTTGACTGCCCT +GACCCGAGCTTGTCAATTGGCAAAATACAAGGCTGCAAACATTTGCACTGACAGCTGCTATGCTTTTGGGGTTTGCATGA +CTTTGGGATGCTATGGAAAGATGGAGGATATTTAGCCTCCTCAGGGCAACCCATAAAAAATGTACAAGTATCAGAGCTGT +TAGAAGCTattctaggccgggtgcagtgactcacgcctgtaatcccagcactttgggaggccgaggcgggcggatcacat 
+gaggtcaggagttcaagaccagcctggccgacatagtgaaaccccgtctctactaaaaatacaaaaattagccaggcatg +gtggcgggtgcctgtaatcccagctacttgggaggttgaagcaggagaattgcttgaacccgggaggtggaggttacagt +gagccaagactgcgccactgcacttcagcctgagtgacaagagtgagactccatctcaaaaaaaaaaaaaaaaaaaaagc +tattctaaaacCAAAATAGTTCACAACCATAAAAATCCCAGGTCACTTTAAATTAAACACCACAGAAGTTCAGGGTAACC +AATTGGCTCATGCCACAGCTAAAAGAGCAGCATTTGAGCCAGCCCCAATCCAGAAAATGACCATAAAACTCAAAACACTT +AAAAAcatatgatatagtttggctgtgtccccacccaaatcttaccttacattgtagctcccacaattctcacatgttgt +gggagggacccagtgggagataattgaatcacgggggcagtttccctcatgctgttctcatggtagtgagtaagtctcat +gagatctgatgactttataaggggaaaccccttttgcttggctcccattctctcttgcttgctgccatgtaagatatgcc +tttccccttccaccatgattgtgaggcctccccagccatgtgaaactgtgagtccattaaacctctttttcttgataaat +tacccagtcttgggtgtgtcttcatcagtagtgtgaaaacggagtaatacagtaaattggtaccagcagagtgtggtggt +gttgtaaagatacccaaaaatgtggaagtaactttggaactgggtaacaggtagaggtcggaacagtttggagggctcag +aagaagacaggaacatgtgggaaagtttggaactccctagagacttgttgaatggatttgaccaaaatgctgataatgat +atggacaatgaaatccaggctgaggtggtctcagatggagagaaggaacttgttgggaactagaataaaggtgactcttg +ctatgttttaggaAAGAgaatggtggcattttgcccctgccctagacatctgtgaaactttgaacttgagggagatgatt +taggtaTCTGGcaggaaatttctaagcaaagaaaagcattcaagaggtggctTGGGTGCTGTTAATAAGATTAAGTTTTt +gaagggaaacagcataaaagtttagaaaatgtgCAGTCTGACGatgagatagaaaaggaaaacccattttctgtggagaa +attcaagacagttgcagaaatttgcataagtaatgaggagccaaattttttgtttgtttgttttgtttttttgagatggg +agtctccctctgtcgcccaggctggagtgcagtggcatgatctcggctcaccacaccctcccttcctgggttcaagcaat +tctcctgcctcagcctcctgagtggctgggattacaggcgcctgactccactcccagctaatttttgtatttttagtaga +gacggggtttcaccatgttggccaggctggtcttgaactcctgatctcaagtgatctgcctgccttggcctcccaaagtg +ctggaattacaggcgtgagcaaggagccaaatgttagtcaccaagacaatgagaaaatgtctccagggcatgtcagagac +attcgtggcagcccctcccatcacaggcctggaggcctagtaggaaaaaatgttttttggtgcctggcccagggcccccc +tgctgtgtgcagtctagtaaggacttggtgccctgtgtcccagccactctagcaatggctaaaaggggccaaggtatagc +ttGGGTTGTGGCTTCAAAGAATGGAAGctctaagccttggcagcttccacgtgatgtggagcctgtgggtgcacagaagt 
+caagaattgaggtttgggaacctctgcctagatttcagaggatatatggaaatgcctggacgTCCAGGCAGGAGTTTGCT +GCAGGgctggggccctcatggagaacctctgctagggcagtgcagaagaaaaatgtgaggttgaagcccccacacagagt +cctcactagggcactgcctagtggagttgtgagaagagggccactgtcctccaaacCCCACACTGGTAGATCCATCAACA +GCTTGCTCTGTGGCCCTGGAagagccacagacactcaactccagcccatgaaagcagccaagagtggggtataccctgca +aagccacagggcagaactgcccaaggccatggaaacccaccttttgcatcagtgtgacccggatgtgagacatggtgtca +aaggaggtcattttggaactttaaagtttaatgactgccccactggatttcggacttgcatgggccttTAGTCCCTtcgt +tttggctaatttctctaATTtcgaatggctgtatttacccaatgactatacccccattgtatctaggaagtaactaactt +acttttgattttacaggctcataggcagaagggacttgctgtGTCCCAGATGAGagtttggactgtgaacttttgagtta +atgctaaaaggactttgggggactgttgggaaggcattattgcttttgaaatatgaggacatgagatttgggaggggcca +gggatgcaatggtattgtttggctgtgtccccacccaaatctcatcttgaattatggctttcataatttccatgtgtttt +ggGAGGGATCTggagggagataattgaattaggGGGGCAGTTTTTCCCCATACTTTTCTCCTGGTAGTGAacaaatctca +caagatctgatggttttataacgggAAACCCTTTCAtgtggctctcattctctcttgcctgctaccatgtgaCTGAAGAA +GCCTCACAAAACATGCAAACAGTACAGATTTCAACAATCCTGAACCAGCAAACAGATGGTCAAGGAATCTGCAAGCCCTG +TGGCACATCTTAATCCGTTAGGTCCTCGTGTCTAGGTGCATGCTGATGAGAATATGTTCCACCAAATCCAGGCTGCCCTC +CTAGGGTGATGGGAAAGCAGAGGGGTTAAGTATGGCACAATCCACAGTTCAGTTCACCAAGTAAGGATGTGAGtaaaatg +ctaaaaacaaaaaagccactgAAGAGTTGCTTATAGAGTCCAAACTCATGTAGAAAAGGCCGTGTCCCTCAAGGCTTTCA +TTGTCACCAACAAATCTGGTATTGAGATTGCCAAGAATGATTACTCAGCCATTAATTTTGCAGAGTAAGaagaaccaatt +tttaaaatgaggctgggtgtaTGGCAGCTCTGAGCATGCCCATATTTAGTATGAAAATGGATGGCACCCCCATTCTAGGA +AATCTCCGCCTTTTCCCtagaaaaccacatgattattccACCCCCTAATTAGAAGAGCACATAAAGTTAGAAACCCAAAC +TCCTTTGTGCATGACCTCTGTCCTAAGTGTGAGCTTTCGCTTTGCAATCAAAGCTCCTGGCTTTCCACTTCACGGACTcg +gccctgaattctttctcacTGTGGTGCCAAAAACCTGGACACCAGCTGTGGCTGGCATCCCACAGGCATCGGTAGACCCT +CTTGAACCCTCCAGCAACAGGTGAGGGACAGGAAGatggcagggagggaggccaggATGTCCTCAGGTGAGTCCAAGTAG +GAAGGCCCACTGCTGGGAAGGGGGCCTGTTGTCCAGTTGGTCACCCTTGAGCTGTCCTGGGAGATGGGTGTGTCTGTCCA +GCTTCAGCCTGGCCAAGCTTACGCACTGACTGCGGCTTTCTCTGCTGTCTTGGCTCCTGGGTGGTCATCATCATATGTCA 
+TCCTCTGTGGATGACATTTCCTCAAGTCCAGCGCTGGGTGGCTAGGATAGGCCTAGGATGTCCATCCAGCTGGCTACATC +CTCAAGAGCTCATGGGCCCAGCAATGGCCTGAGGAGTACTTCAAGGTGTGTCCAGCACAGGTCCTGTGCAGTGTGACACC +CCACCAGGTGACTTCAGGGTGTGTGCCTGCTGCCTGGACCCAGAGGGCAAGCATGTGAAATCACAGCTCTCTCTTTGGGA +ACAAAAGGAGAAGCAGTTTTTGCATGACTTAGTTCCCAAGCTTAGTTTTCCTTTGGAATTGTGAGTTTGGGGTCTTGAAA +TTCTATTTTCCTTCCAcagcttctttctcttcccttttccttgtctcttcagttataatttttttgttgttgttaatatt +aACTGTCAGGAATTTCTGTTTCCAGAACCTGGCTGATTACATGGTCAGTGATTTCTAGGGCAGcaggagaaatattttaa +aagctacttTTAAGAAGCATTATTTTATTGGAGATGGTGTAtttgttcattctcatgctgctatgaaaaaaCACCTGAGA +ataggtaattcataaaggaaaggggtttaagtgactcacggttccacattgctggggaggcctcaggaaatttacaatca +atgcagaaggcaggtacCCTCTCCACAGGGCTGCAGGATGCAATGAATGCAAGCAGGGCAAATGCCAGACGCTtaacaaa +accatcagatcccgtgagactcactcactatcacaacagcacggggaaaaccacccccacgATCCGGTCACCTCCGCTTG +GTCCTgctcttgacacatggggattacaattcaagatgagattttgggtggggacacagccagaccatgtCAGATAGGTc +actggtctcaaagtcctggcttcAAGAAATCATTCCTTCTTATCCTCCCATatttctgggatcacaggcatgagccacca +taagctacttaaaaaaaaaaaaaagcctggcagtGTAGGCAGGAAAGTTAAGGGATTCCTTGGGGCAGGGTTGGCAGGTA +AGTAGAGGGATTCCCTGAGGTTTATGTTTCCTCCAGGGCTGTTGCTTGCCAGGCGTTTGATGTGCCTGCCCTGAGGGCAG +CCTGAGGCCTGACACCACCACCCAAAGCCTGGACTCTGGGACCGTGGCTCTCAGTGCATGAACTCAGGGAAACTCTCCCT +TCAGAGAAAGGTGGGTAGAGGTGGGTGCCTCTCCCACCTTACCTGTGGGTAGGAAGGGGGCTGCATCTCTCAAGCCAGAA +ACCTGAAGCAGGCACCGAGTGCTCCCTGTCTGAGCCTGGGTGGAAGGGCAGAGATATGTTCCTGAGAATTCCCAGCAATC +TGGGGGGGATTCGCACTTCCTTGCAGCAGGAGCTCTGGGTTACTTTGCTGGTTGAAGAGGAAAGTAAGTGCCAGGCCCTC +GGCTTCGTGAAAGGGGTGGGTGGGCAGGTCAAGGTGGGCTGTGGAACTCCTCTGTGTACACATTGAATTTAGAACTTTAG +CTTTGACCCATGCAGAAGTGAATGTGCACGCTCTGAGAACTCATGAGAAAATCACCAGAGAAAACAAGCCTTGAGCAGAG +TCCCAAGAAACCACAAACAAAACATCGAGTGCGCAGTGGCTGCCGGTCCTGAGATGATCAGACCCAGAATAGGAATGCAT +CTGTGTTTAAAGGAGTGCAAAGCAGTGAGGAGGAGAAAGCATCAGAACTATTGAAGAGTCCAGAACAGAAGCCAGGGAAA +TTCCCAGAAATGAAAATCAGGACAACTAACATTAGAAACAGGATGAATTGGCTAAAAAGGTTAGACATAGCTAAAGAGCT +GCAGTGATAAAAGGGCTGGACTGAATGAGCCCGCTTATGTGAAGTTCAAACACAGGCAAAGCCAAACGTGACCTAGAGAC 
+ACAGACGTGCAGTAGAACCATGAGGAAGGGCAAGGGATGGTTAACACGAGAGTCTCGTGAGATTGTGATGATCTCCTGGG +GGAACAGCATGCAGAGGGAAGGCCCTGGTGACACCTGAGGCCAATGGGGCATTCTGGACTCGAAGGTTGTTGAGCAGCgg +ccctggcctccacccaccagatgccagtggtGGCCCCTCCCTGTGATGACAACCCAGGCTGTCTGCAGACTTGGCAAGCT +CCCCTGGGGGGAGAAAAATTGCCCCCAGGTGACAGCCATTGAATTAGAGGTTAAGAGTGTTTCAATTGCTTATGCTGTCT +TTCTTAGGGTGGGCAGTGCCTATGTAGGTATTCattctgtattatttaaaatgtctacaTGTGCACCCTTTATTTTAGGA +TATGTTtttcaggaataaaataataatatacattctCTGGCCTAGTAACAGAGAATCCTACTGGATAAATGAATATGGAA +AATAGTTTTCCTGGGAGCTTATAGCAGGGCTTCTCAGTGTTATTTGCTATTTAACTCCGTATTGAAAAAtgttggcaggg +cacagtggctcacatctgtaatcccaacactttgggaggctgaggcgggcagatcacttgaggtcagaagttcaagacca +gcctggccaacatggcaaaaccctgtctctactaaaagtacaagaattagccagcatgatggtgcatgcctgtaatccca +gctacttgggaggctgaggcaggagaattgcttgaaccgcgaggcagaggttgcagttagctgagatcgcaccactgcac +tccagcctgggtgacagagcaagactctgtttcaaaaaagaaaaaagaaaaagaataagaaaaaatatctagGAAGAAGA +ACTTTCCAAAAACAGCTTTATCTAGTGTGGGGACACAATTTtgtagtcccaagtactcaggaggctgaggcaggaggacc +acttgaccttgggggtttgaggccagcctgggcaagatagaccttatctctaaaaaataaacaaggccaggtgtggtggc +tcacgcctgtaatcccagcactttgggaggctaaggcgggcggatcacgaggtcaggagttcgagaccagcctggccaat +atggtgaaaccttgtctctactaaaaatacgaaaattagccgggtgtggtggcatgcgcctgtagtcccagctacttggg +aggctgaggcaggagaatcgcttgaacccaggaggcagaggttgcagtgagccaagatcatgccattgcactccagcctg +ggtgacagagtgagactctgtctcaaaaaaaaaaggaaagtttattagGAAGAATGCTGTaacagagtgcagtggggcgC +CTCAGAGAGAGGATTGAGCGCCCAGTGGTGGATTTTCCATAGGAGCATTTATGGACCTTAAGGCTGGAGCTTAGGGTTGT +AAACTGAGTTTCAGCATGGCATTCCAGAGatgtttagaaattttatttacttataaaagTTGAAAGAGGCCTGGAATTGA +AGCGGTGTCATTTGTCTGGGGTAATACCTGAGGTTTGTTGCCTCATGTCAAGGAAATTGAGGACGTAGACCCACAATGAG +TGAGTTTTAGAGTgggagtttaataggcaaaagagaaAAGTTCCCTTGTGCAGAGTGAGGGGGGTTCTGAGAGGATTTCC +CCCTTTGCAACCAGATGGTTTTATCGATGAGCTTGAgaaggtggtgtctgatttacatagggcacagaggattggttgga +ccaggtgtgccatttacatAGCACTGGAAGAAGCGGCCAACTCATCCTTTTATTATGCAAATCTTTTTATGCAAATGGAG +TCTCTACGCGCCAGGCGCCACGTTGCCTGCTTTTTTACTGCACATGTGAtgacaaagaaaatggaagggGAACCTCCATG 
+TTGAACATACTTGGCTTCCAGGTatcccttttctattggcacagctgccggcATTCACCTATGCAGGCTTCCAGCTGGCT +TATCTgtgtctgcagctcaattttacaagctgctctttgttagaaaagaaataatttgggggctgctttttgttaaaagg +aaaGCCTTACTGAGGACTCTCTTACCCTTGCagcaggacgagccacagacaaaacctctcagacaccgagttgtagaagg +aagggctttattcagctgggagcatcggcaagctaCTGTCTCAAAATGCGAGGTCCCTgaatgcacaatttctgtccttt +ttaagggctcacaacactaaagatttcataTGAAAGGATTGTGATTGACTGAGCAATCTAGGGGATACGTAACAGGGGTG +TCGTGCAGAGAGAGTCggagagaaacagaacagagcagggagtttcacaatgttcttccATACAATGCCTGAAATCTATG +GGTAACATCGGGTTCTAAGTcaagagttgatttttaactactaggtttaggccaggcaggcccaggcctggttttgggCC +TGGCGctgggctgcctgtctttgatttcacttccttggtttttttttttaatcaggtactgagtataaaacaatatgaaa +caatatgagagggtctctctcttccctcaccctCACTAATGGCCTAAATAAACAATATGAGAgggcctctctcttccctc +accctCACTAACTGCCTAAATGATTTCTTTCTAGTTCCTGTATCATGATCAGATGCCGTCTTCAGATACTAGGGAAGTTT +GATTACTTCTAAATTCCCCAGATAAGGAGTTTTGCCTCCAGATGGCCTGTTTGATGGTCACCAGGTGGTCTTGGCTCCCT +TCTGAATTGCCCAGGTAAGAAGATTTGTCTCTGGGGCCTGTCCAATGGTCACCAGATGATTTTTGCCCTCCtcacagcct +aggcaacatagtaagacccccatctctacaaaaaatgcaaaatcagccaggcgtggcagcgcacgcctgtggtcccagct +actcagaggggctgagacaggaggatcgcttgaacccaggaggtctgGGTTGTAATGAGCCGTgactgcacctctgcact +ccagcctgggagatggagtcttaaaaaaaaatgtttttccggGTGTTATGAGTGTGCTCTCCAGCTTCAGTGTGACTTTG +TTTGTTGGAGACTTGGGTAAGTAtccaaattgtttttcagatGTAAATGTTAGTAGCTCTCATTCATGGGAAGTTATGGT +CTGCTCTTTAAGTTCTATGGGGCTATCTTCACATCTGTGGCACAAGGCTGGTCAGTTTTACATCTGAGAAAAGTGAGTCT +GGGGAAGTGCCCCTGCCATCCATCCCACAGCAGGCAAGCAGCCTCCTCCgcattctcctcttcctcttctcatgcCTGGT +CCTCTCCTTTTGTCATCTTTTCTTCCCTCACCCCATCTTTATTGTATCTCTTTTTGTCTGTctctaaacaatattttaat +tggTTGCATTGTTTTTATGTTCCCTGTTTTTAATATTCATGacattcccttcctctcccccttcctGTCTTCTCCACATT +AAGTTCACCCTCTTTGTACTTTTACCTGAGTGCTTCCCCCCGttctcctccaccccaccccacttcaGGCAATTCTGAGT +GGTACAGAGGTGTCACATCCATCTCCTGGGACTGTATGAGTCTGTCTCACTTTCCTTTGAAAGAAGCAGTTGGAAATTCA +ttctacttcatttttctttctttctttctttcttttgagatggagtctcgctctgtcccccaggctggagtgcagtggca +cgatcgtggctcactgcaacctccgcctcctgagttcaagcgattcttctgcctcagactcccaagtagctgggattaca 
+ggcgtgagccaccacgcccgactaatttttgtattttcagtagagacaaggtgtcaccatattggccaggctggtctcca +g diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.ndb b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.ndb new file mode 100644 index 0000000..6b7012d Binary files /dev/null and b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.ndb differ diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nhr b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nhr new file mode 100644 index 0000000..2f0f40e Binary files /dev/null and b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nhr differ diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nin b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nin new file mode 100644 index 0000000..3c96f64 Binary files /dev/null and b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nin differ diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.njs b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.njs new file mode 100644 index 0000000..27896c8 --- /dev/null +++ b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.njs @@ -0,0 +1,24 @@ +{ + "version": "1.2", + "dbname": "HOMSA@9606@v1.fa", + "dbtype": "Nucleotide", + "db-version": 5, + "description": "../data//assembly_dir//HOMSA@9606@v1/HOMSA@9606@v1.fa", + "number-of-letters": 56001, + "number-of-sequences": 1, + "last-updated": "2025-09-14T19:37:00", + "number-of-volumes": 1, + "bytes-total": 63480, + "bytes-to-cache": 14162, + "files": [ + "HOMSA@9606@v1.fa.ndb", + "HOMSA@9606@v1.fa.nhr", + "HOMSA@9606@v1.fa.nin", + "HOMSA@9606@v1.fa.nog", + "HOMSA@9606@v1.fa.nos", + "HOMSA@9606@v1.fa.not", + "HOMSA@9606@v1.fa.nsq", + "HOMSA@9606@v1.fa.ntf", + "HOMSA@9606@v1.fa.nto" + ] +} diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nog 
b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nog new file mode 100644 index 0000000..e4342d3 Binary files /dev/null and b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nog differ diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nos b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nos new file mode 100644 index 0000000..953c4d1 Binary files /dev/null and b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nos differ diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.not b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.not new file mode 100644 index 0000000..d656266 Binary files /dev/null and b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.not differ diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nsq b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nsq new file mode 100644 index 0000000..19736e8 Binary files /dev/null and b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nsq differ diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.ntf b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.ntf new file mode 100644 index 0000000..005ac41 Binary files /dev/null and b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.ntf differ diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nto b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nto new file mode 100644 index 0000000..20d5cb8 Binary files /dev/null and b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nto differ diff --git a/fdog/data/conda_requirements.yml b/fdog/data/conda_requirements.yml index e5ef007..d5deb2b 100644 --- a/fdog/data/conda_requirements.yml +++ b/fdog/data/conda_requirements.yml @@ -4,3 +4,5 @@ fasta3=36.3.8i clustalw mafft muscle=5.1 +augustus=3.5.0 +metaeuk diff --git 
a/fdog/data/dependencies.txt b/fdog/data/dependencies.txt index 8a7853b..6b36db4 100644 --- a/fdog/data/dependencies.txt +++ b/fdog/data/dependencies.txt @@ -3,3 +3,5 @@ hmmer clustalw mafft muscle +augustus +metaeuk diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py new file mode 100644 index 0000000..850304a --- /dev/null +++ b/fdog/fDOGassembly.py @@ -0,0 +1,1405 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2021 Hannah Muelbaier +# +# This script is used to run fDOG-Assembly which performs targeted ortholog +# searches on genome assemblies +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: hannah.muelbaier@gmail.com +# +####################################################################### + +############################ imports ########################################### +import os +import os.path +import sys +from Bio import SeqIO +from Bio.Phylo.TreeConstruction import DistanceCalculator +from Bio import AlignIO +import argparse +import yaml +import subprocess +import time +import shutil +import multiprocessing as mp +import fdog.libs.alignment as align_fn +import fdog.libs.zzz as general_fn +import fdog.libs.fas as fas_fn +from tqdm import tqdm +from pathlib import Path +import pandas as pd + +########################### functions ########################################## +def check_path(path, exit=True): + if not os.path.exists(path) and exit == True: + print(path + " does not exist. 
def check_ref_spec(species_list, fasta_file):
    """Return the first species of ``species_list`` present in the ortholog group.

    The species name of every sequence is read from the second
    ``|``-separated field of its FASTA header line in ``fasta_file``.
    Header lines without a ``|`` are ignored instead of raising IndexError.

    If none of the species is found, a message is printed and the program
    terminates with exit status 1 (an error must not look like success).
    """
    species_in_file = set()
    with open(fasta_file, "r") as lines:
        for line in lines:
            if not line.startswith(">"):
                continue
            fields = line.rstrip("\n").split("|")
            if len(fields) > 1:
                species_in_file.add(fields[1])
    for species in species_list:
        if species in species_in_file:
            return species
    print("Reference species is not part of the ortholog group. Exiting ...")
    sys.exit(1)


def starting_subprocess(cmd, mode, time_out=None):
    """Run the shell command ``cmd`` and return the ``CompletedProcess``.

    ``mode`` selects which streams are captured:
      * ``'debug'``  - nothing is captured, output goes to the terminal
      * ``'silent'`` - stdout and stderr are captured
      * ``'normal'`` - only stdout is captured

    Returns 1 if the command runs longer than ``time_out`` seconds.
    Raises ValueError for any other ``mode``; previously the function fell
    through and returned None, which only failed later at the caller.
    """
    if mode == 'debug':
        stream_options = {}
    elif mode == 'silent':
        stream_options = {'stdout': subprocess.PIPE, 'stderr': subprocess.PIPE}
    elif mode == 'normal':
        stream_options = {'stdout': subprocess.PIPE}
    else:
        raise ValueError("Unknown subprocess mode: %r" % (mode,))

    try:
        return subprocess.run(cmd, shell=True, timeout=time_out, **stream_options)
    except subprocess.TimeoutExpired:
        return 1
def parse_blast(line, blast_results, cutoff):
    """Add one tab-separated tBLASTn hit line to ``blast_results``.

    Columns read from ``line`` (0-based): 0 subject name, 1 subject start,
    2 subject end, 3 e-value, 4 query start, 5 query end, 6 score.
    Hits with an e-value above ``cutoff`` are ignored.

    ``blast_results`` maps a contig name (the second ``|``-separated field of
    the subject name when present, else the whole name) to a list of
    ``[sstart, send, evalue, qstart, qend, strand, score]`` with
    ``sstart <= send``; the strand is ``'-'`` when the hit was reported with
    start > end.

    Returns ``(blast_results, evalue)``. A blank line is skipped and yields
    ``(blast_results, None)``.
    """
    line = line.replace("\n", "")
    if not line.strip():
        # e.g. a trailing empty line in the BLAST output file
        return blast_results, None

    line_info = line.split("\t")
    evalue = float(line_info[3])
    if evalue > cutoff:
        return blast_results, evalue

    node_name = line_info[0]
    sstart, send = int(line_info[1]), int(line_info[2])
    qstart, qend, score = int(line_info[4]), int(line_info[5]), int(line_info[6])

    # tBLASTn reports minus-strand hits with start > end; store ascending
    if sstart < send:
        strand = "+"
    else:
        sstart, send = send, sstart
        strand = "-"

    name_fields = node_name.split("|")
    if len(name_fields) > 1:
        node_name = name_fields[1]

    # entries stay lists (not tuples) because later steps modify them in place
    blast_results.setdefault(node_name, []).append(
        [sstart, send, evalue, qstart, qend, strand, score])
    return blast_results, evalue


def get_x_results(blast_dic, x, score_list):
    """Keep only the regions whose score reaches the ``x``-th best score.

    ``score_list`` holds the scores of all regions in ``blast_dic``; it is
    no longer sorted in place. Ties with the threshold score are all kept,
    so more than ``x`` regions can be returned. If ``x`` exceeds the number
    of scores, every region is kept; an empty ``score_list`` yields
    ``({}, 0)``.

    Returns ``(filtered_dict, number_of_regions)``.
    """
    ranked = sorted(score_list, reverse=True)
    if not ranked:
        return {}, 0
    threshold = ranked[min(x, len(ranked)) - 1]

    new_dic = {}
    number_regions = 0
    for key, entries in blast_dic.items():
        kept = [entry for entry in entries if entry[6] >= threshold]
        if kept:
            new_dic[key] = kept
            number_regions += len(kept)
    return new_dic, number_regions
def extract_seq(region_dic, path, tmp_path, mode):
    """Write every contig named in ``region_dic`` to ``tmp_path + key + '.fasta'``.

    The sequences are fetched with ``blastdbcmd`` from the nucleotide BLAST
    database ``path``. Paths and contig names are shell-quoted so that
    spaces or shell metacharacters cannot break or alter the command.
    """
    import shlex  # local import: only needed to build the shell commands

    for key in region_dic:
        cmd = ("blastdbcmd -db " + shlex.quote(path) + " -dbtype 'nucl' -entry "
               + shlex.quote(key) + " -out " + shlex.quote(tmp_path + key + ".fasta")
               + " -outfmt %f")
        starting_subprocess(cmd, mode)


def extract_sequence_from_to(name, file, start, end):
    """Write the slice ``[start:end]`` of the records in ``file`` to ``name + '.fasta'``.

    A negative ``start`` is set to 0 and an ``end`` beyond the record length
    is reduced to that length (the reduced value is kept for any following
    record). Returns ``(output_path, start, end)`` with the adjusted
    coordinates.
    """
    out = name + ".fasta"
    if int(start) < 0:
        start = 0
    with open(out, "w") as f:
        for seq_record in SeqIO.parse(file, "fasta"):
            f.write(">" + str(seq_record.id) + "\n")
            sequence_length = len(seq_record.seq)
            if int(end) > sequence_length:
                end = sequence_length
            f.write(str(seq_record.seq[int(start):int(end)]) + "\n")

    return out, start, end


def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode):
    """Predict genes with Augustus in all candidate regions.

    For every region the prediction window is the region extended by
    ``length_extension`` on both sides; the window start is clamped at 0,
    as the MetaEuk code path does. The predicted amino acid sequences are
    written to ``candidatesOutFile`` with headers
    ``>group|ass_name|<contig>_<n>_<augustus id>``, and the windows to the
    matching ``.regions.txt`` file. Regions for which no ``.aa`` file was
    produced are skipped.
    """
    import shlex  # local import: only needed to build the shell commands

    regions_file = candidatesOutFile.replace(".candidates.fa", ".regions.txt")
    with open(candidatesOutFile, "w") as output, open(regions_file, "w") as region:
        region.write("Contig/scaffold" + "\t" + "start" + "\t" + "end" + "\n")
        for key, locations in regions.items():
            contig_fasta = tmp_path + key + ".fasta"
            for counter, location in enumerate(locations, start=1):
                start = str(max(0, location[0] - length_extension))
                end = str(location[1] + length_extension)
                name = key + "_" + str(counter)
                gff_path = tmp_path + name + ".gff"

                # Augustus call; stdout is redirected into the gff file
                cmd = ("augustus --protein=1 --gff3=on --proteinprofile=" + shlex.quote(profile_path)
                       + " --predictionStart=" + start + " --predictionEnd=" + end
                       + " --species=" + shlex.quote(augustus_ref_species)
                       + " " + shlex.quote(contig_fasta) + " > " + shlex.quote(gff_path))
                starting_subprocess(cmd, 'normal')

                # transfer the Augustus output to amino acid sequences (<name>.aa)
                cmd = ("getAnnoFasta.pl --seqfile=" + shlex.quote(contig_fasta)
                       + " " + shlex.quote(gff_path))
                starting_subprocess(cmd, mode)

                region.write(key + "\t" + start + "\t" + end + "\n")

                try:
                    with open(tmp_path + name + ".aa", "r") as sequence_file:
                        for line in sequence_file:
                            if line[0] == ">":
                                # gene_id keeps its trailing newline, which ends the header
                                gene_id = line.replace(">", "")
                                output.write(">" + group + "|" + ass_name + "|" + name + "_" + gene_id)
                            else:
                                output.write(line)
                except FileNotFoundError:
                    # no gene was predicted in this region, continue with the next one
                    pass
in locations: + #some variables + counter += 1 + start = str(i[0] - length_extension) + end = str(i[1] + length_extension) + name = key + "_" + str(counter) + file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) + region.write(key + "\t" + str(start) + "\t" + str(end) + "\n") + #metaeuk call sensitive + #cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 --remove-tmp-files -s 6" + #print(cmd) + cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 15 --max-overlap 15 --min-intron 5 --overlap 1 -s 4.5 --remove-tmp-files" + # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 + starting_subprocess(cmd, mode) + # parsing header and sequences + try: + sequence_file = open(tmp_path + name + ".fas", "r") + lines = sequence_file.readlines() + #print(lines) + id = 0 + for line in lines: + if line[0] == ">": + id += 1 + header = ">" + group + "|" + ass_name + "|" + name + "_" + str(id) + "\n" + output.write(header) + else: + output.write(line) + sequence_file.close() + + gff_file = open(tmp_path + name + ".gff", "r") + lines = gff_file.readlines() + new_lines = [] + for line in lines: + values = line.split("\t") + values[3] = str(int(values[3]) + int(start)) + values[4] = str(int(values[4]) + int(start)) + new_lines.append("\t".join(values)) + gff_file.close() + gff_file = open(tmp_path + name + ".gff", "w") + for line in new_lines: + gff_file.write(line) + gff_file.close() + except FileNotFoundError: + pass + region.close() + output.close() + +def searching_for_db(assembly_path): + + db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto'] + check = True + for end in db_endings: + if not 
def get_distance_biopython(file, matrix):
    """Read the FASTA alignment ``file`` and return its distance matrix.

    If the distance calculation raises ValueError, every ``U`` in the
    aligned sequences is replaced by ``C`` and the calculation is repeated.
    """
    with open(file) as input_handle:
        aln = AlignIO.read(input_handle, 'fasta')
    calculator = DistanceCalculator(matrix)
    try:
        dm = calculator.get_distance(aln)
    except ValueError:
        # the amino acid U is scored as C during distance calculation
        for record in aln:
            record.seq = record.seq.replace('U', 'C')
        dm = calculator.get_distance(aln)
    return dm


def readFasta(fasta):
    """Return an iterator over the records of the FASTA file ``fasta``.

    Prints a message and terminates with exit status 1 if the file does not
    exist.
    """
    path = Path(fasta)
    if not path.exists():
        print(str(path) + ' does not exists.')
        sys.exit(1)
    return SeqIO.parse(path, "fasta")


def getSeedInfo(path):
    """Map each species of the core group FASTA ``path`` to its gene ids.

    Species and gene id are the second and third ``|``-separated field of
    the sequence id.
    """
    dic = {}
    for entry in readFasta(path):
        fields = entry.id.split("|")
        dic.setdefault(fields[1], []).append(fields[2])
    return dic


def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path, mode='silent'):
    """Decide whether a candidate is accepted although its best hit is not ``ref``.

    The best hit, the reference sequence and the candidate are aligned with
    ``msaTool`` (``'muscle'`` or ``'mafft-linsi'``). The candidate is
    accepted when the best hit is closer to the reference than to the
    candidate.

    Returns ``(1, dist_ref_hit, dist_hit_query)`` when accepted,
    ``(0, dist_ref_hit, dist_hit_query)`` when rejected and
    ``(0, "NaN", "NaN")`` when the distance computation raises ValueError.
    """
    ########### getting sequences and write all in one file to make msa ########
    name_file = candidate_name + ".co"
    output_file = tmp_path + name_file + '.fa'
    aln_file = tmp_path + name_file + '.aln'
    genome_dir_path = dataPath + '/searchTaxa_dir/%s/%s.fa' % (fdog_ref_species, fdog_ref_species)
    if not os.path.exists(genome_dir_path):
        genome_dir_path = dataPath + '/genome_dir/%s/%s.fa' % (fdog_ref_species, fdog_ref_species)

    with open(output_file, "w") as out:
        with open(genome_dir_path) as genome_handle:
            inSeq = SeqIO.to_dict(SeqIO.parse(genome_handle, 'fasta'))
        out.write(">" + best_hit + "\n")
        out.write(str(inSeq[best_hit].seq) + "\n")
        out.write(">" + ref + "\n")
        out.write(str(inSeq[ref].seq) + "\n")
        for record in readFasta(candidatesOutFile):
            if candidate_name in record.id:
                out.write(">" + candidate_name + "\n")
                out.write(str(record.seq) + "\n")
                break

    mafft_cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file
    if msaTool == "muscle":
        if align_fn.get_muscle_version(msaTool) == 'v3':
            # the blank before "-out" was missing, which glued the option
            # onto the input file name and made every MUSCLE v3 run fail
            cmd = "muscle -quiet -in " + output_file + " -out " + aln_file
        else:
            cmd = "muscle -align " + output_file + " -output " + aln_file
        starting_subprocess(cmd, mode)
        if not os.path.exists(aln_file):
            print('Muscle failed with command: %s'%(cmd))
            print("Muscle failed for file %s. Making MSA with Mafft-linsi." % (candidate_name))
            starting_subprocess(mafft_cmd, mode)
    elif msaTool == "mafft-linsi":
        starting_subprocess(mafft_cmd, mode)

    try:
        distances = get_distance_biopython(aln_file, matrix)
        distance_hit_query = distances[best_hit, candidate_name]
        distance_ref_hit = distances[best_hit, ref]
    except ValueError:
        # failure in distance computation, the candidate is rejected
        return 0, "NaN", "NaN"

    if distance_ref_hit < distance_hit_query:
        # accepted
        return 1, distance_ref_hit, distance_hit_query
    # rejected
    return 0, distance_ref_hit, distance_hit_query
against the ref taxa + seedDic = getSeedInfo(fasta_path) + #print(fasta_path) + orthologs = [] + #print(seedDic) + blast_dir_path = dataPath + "/coreTaxa_dir/" + #print(blast_dir_path) + if not os.path.exists(blast_dir_path): + blast_dir_path = dataPath + "/blast_dir/" + #print(blast_dir_path) + if strict != True: + seed = [fdog_ref_species] + try: + id_ref = seedDic[fdog_ref_species] + except KeyError: + #print("The fDOG reference species isn't part of the core ortholog group, ... exciting") + return 0, seed + if searchTool == "blast": + cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + starting_subprocess(cmd, mode) + else: + print("diamonds are the girls best friends") + ##### diamond call + + alg_file = open(tmp_path + "blast_" + fdog_ref_species, "r") + lines = alg_file.readlines() + alg_file.close() + old_name = None + min = 10 + for line in lines: + id, gene, evalue = (line.replace("\n", "")).split("\t") + gene_name = gene.split("|")[2] + if gene_name != old_name: + print("candidate:%s"%(gene_name)) if mode == "debug" else "" + print("blast-hit:%s"%(id)) if mode == "debug" else "" + min = float(evalue) + if id in id_ref: + orthologs.append(gene) + print("\thitting\n") if mode == "debug" else "" + else: + if checkCo == True: + for i in id_ref: + print("Best hit %s differs from reference sequence %s! 
Doing further checks\n"%(id, i)) if mode == "debug" else "" + co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) + if co_orthologs_result == 1: + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" + orthologs.append(gene) + elif co_orthologs_result == 0: + if distance_ref_hit != "NaN": + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" + else: + print("\tnothitting\n") if mode == "debug" else "" + elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs: + if id in id_ref: + orthologs.append(gene) + print("\thitting\n") if mode == "debug" else "" + else: + if checkCo == True: + for i in id_ref: + print("Best hit %s differs from reference sequence %s! 
Doing further checks\n"%(id, i)) if mode == "debug" else "" + co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) + if co_orthologs_result == 1: + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" + orthologs.append(gene) + elif co_orthologs_result == 0: + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" + else: + print("\tnot hitting\n") if mode == "debug" else "" + old_name = gene_name + + + if orthologs == []: + #print("No hit in the backward search, ...exciting") + return 0, seed + + else: + if taxa != []: + seed = taxa + try: + i = seed.index(fdog_ref_species) + seed.insert(0,seed.pop(i)) + except ValueError: + seed.insert(0,fdog_ref_species) + #print(seed) + #print("with taxa list from user input") + + else: + seed = [] + for key in seedDic: + if key == fdog_ref_species: + seed.insert(0,key) + else: + seed.append(key) + + orthologs = set({}) + + for species in seed: + #print("backward search in species %s\n" %species) + orthologs_new = set({}) + try: + id_ref = seedDic[species] + except KeyError: + #print("The species " + species + " isn't part of the core ortholog group, ... 
def _write_candidates(output_file, sequenceIds, candidate_fasta):
    """Append the records of ``candidate_fasta`` listed in ``sequenceIds``.

    The record whose id equals ``sequenceIds[0]`` gets the header suffix
    ``|1``, all others ``|0``.
    """
    wanted = set(sequenceIds)
    for entry_candidate in readFasta(candidate_fasta):
        if entry_candidate.id in wanted:
            flag = "|1" if entry_candidate.id == sequenceIds[0] else "|0"
            output_file.write(">" + entry_candidate.id + flag + "\n")
            output_file.write(str(entry_candidate.seq) + "\n")


def _write_references(output_file, core_fasta, species_list):
    """Append the core sequences whose id contains a species of ``species_list``."""
    seq_records_core = list(readFasta(core_fasta))
    for species in species_list:
        for entry_core in seq_records_core:
            if species in entry_core.id:
                output_file.write(">" + entry_core.id + "\n")
                output_file.write(str(entry_core.seq) + "\n")


def addRef(output, core_fasta, species_list):
    """Append the reference sequences of ``species_list`` to the file ``output``."""
    with open(output, "a+") as output_file:
        _write_references(output_file, core_fasta, species_list)


def addSeq(output, seq_list):
    """Append the accepted candidate sequences to the file ``output``.

    ``seq_list`` holds items whose first element is the list of accepted
    sequence ids (or 0 / an empty list when there are none) and whose second
    element is the FASTA file that contains them.
    """
    with open(output, "a+") as output_file:
        for item in seq_list:
            sequenceIds = item[0]
            candidate_fasta = item[1]
            if sequenceIds == 0 or sequenceIds == []:
                continue
            _write_candidates(output_file, sequenceIds, candidate_fasta)


def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path):
    """Append reference and candidate sequences to the file ``output``.

    The reference sequences are written only when ``refBool`` is False, the
    candidates only when ``sequenceIds`` is not 0. ``name`` and ``tmp_path``
    are not used. Always returns 0.
    """
    with open(output, "a+") as output_file:
        if refBool == False:
            _write_references(output_file, core_fasta, species_list)
        if sequenceIds != 0:
            _write_candidates(output_file, sequenceIds, candidate_fasta)
    return 0


def createFasInput(orthologsOutFile, mappingFile):
    """Append the FAS mapping lines and return the id of the first sequence.

    For every record of ``orthologsOutFile`` one line
    ``<sequence id><TAB>ncbi<second '@'-separated field of the id>`` is
    appended to ``mappingFile``. The returned id is the first header line
    without the leading ``>``; the line end is stripped explicitly so that a
    header without trailing newline no longer loses its last character.
    """
    with open(orthologsOutFile, "r") as f:
        fas_seed_id = f.readline().rstrip("\n")[1:]

    with open(mappingFile, "a+") as mapping_handle:
        for seq in readFasta(orthologsOutFile):
            ncbi_id = (seq.id.split("@"))[1]
            mapping_handle.write(seq.id + "\t" + "ncbi" + ncbi_id + "\n")

    return fas_seed_id
def getLocationFromGff(gff_file, name, tool):
    """Return ``[contig, start, end, strand]`` of one gene in ``gff_file``.

    The gene is selected by its running number among the ``gene`` features
    of the file. For ``tool == 'metaeuk'`` that number is the part of
    ``name`` after the last ``_``; otherwise it is taken from the
    second-to-last ``.``-separated part of ``name`` after removing the
    letter ``g``.

    Comment lines and lines that do not have nine tab-separated columns
    (e.g. blank lines) are skipped. Returns None if the file holds fewer
    genes than the requested number.
    """
    if tool == 'metaeuk':
        gene_count = int(name.split('_')[-1])
    else:
        gene_count = int(name.split('.')[-2].replace('g', '').split('_')[-1])

    counter = 0
    with open(gff_file, 'r') as gff:
        for line in gff:
            if line.startswith('#'):
                continue
            columns = line.rstrip('\n').split('\t')
            if len(columns) != 9:
                continue
            contig, _source, feature, start, end, _score, strand, _phase, _att = columns
            if feature != 'gene':
                continue
            counter += 1
            if counter == gene_count:
                return [contig, int(start), int(end), strand]
    return None


def checkOverlap(position, n=30):
    """Find genes that overlap by at least ``n`` positions.

    ``position`` maps a gene name to ``[contig, start, end, strand]``;
    entries whose value is None (gene not located) are ignored. Two genes
    are compared only when they lie on the same contig and strand.

    Returns ``(pairs, overlapping)``: the set of overlapping name pairs
    ``(later_key, earlier_key)`` in dictionary order and the set of all
    names that are part of at least one pair.
    """
    pairs = set()
    overlapping = set()
    keys = [key for key in position if position[key] is not None]
    for index, x in enumerate(keys, start=1):
        contig_x, start_x, end_x, strand_x = position[x]
        for y in keys[index:]:
            contig_y, start_y, end_y, strand_y = position[y]
            if contig_x != contig_y or strand_x != strand_y:
                continue
            if start_x < start_y <= end_x:
                len_overlap = end_x - start_y
            elif start_x == start_y:
                len_overlap = min(end_x, end_y) - start_x
            elif end_x == end_y:
                # shared end: the overlap begins at the later of the two
                # starts (this used the ends before and always gave 0)
                len_overlap = end_x - max(start_x, start_y)
            elif start_y < start_x <= end_y:
                len_overlap = end_y - start_x
            else:
                continue
            if len_overlap >= n:
                pairs.add((y, x))
                overlapping.add(y)
                overlapping.add(x)
    return pairs, overlapping
def clean_fas(path, file_type):
    """Shorten the leading identifier of every line of a FAS output file.

    The file is rewritten in place. On each line the text before the first
    separator is cut down to its part before the first ``|``; everything
    after the separator is kept unchanged.

    Parameters
    ----------
    path : str
        File to rewrite.
    file_type : str
        ``'domains'`` uses ``#`` as separator; any other value uses a tab.
    """
    separator = "#" if file_type == 'domains' else "\t"

    with open(path, "r") as handle:
        lines = handle.readlines()

    # Build the full new content before the file is reopened for writing,
    # so a problem while parsing can never leave a truncated file behind.
    cleaned = []
    for line in lines:
        # Split at the first separator only; later '#' or tabs belong to
        # the remainder of the line.
        long_id, found, remain = line.partition(separator)
        if not found:
            # Blank or separator-less lines are passed through untouched.
            cleaned.append(line)
            continue
        cleaned.append(long_id.split("|")[0] + separator + remain)

    with open(path, "w") as handle:
        handle.writelines(cleaned)
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + while process.poll() is None: + output = process.stdout.readline().decode().split('\n') + error = process.stderr.readline().decode().split('\n') + if error: + cmd_out = '' + for line in error: + line.strip() + if 'error' in line or 'Error' in line: + for i in cmd: + if '|' in i: + cmd_out += " '"+ str(i) + "'" + else: + cmd_out += " " + str(i) + print ("Error running FAS with %s"%(cmd_out)) + process.terminate() + sys.exit() + return output + +def ortholog_search_tblastn(args): + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms) = args + output = [] + asNamePath = asName.replace('@', '_') + cmd = 'mkdir ' + out + '/tmp/' + asNamePath + starting_subprocess(cmd, 'silent') + tmp_path = out + "tmp/" + asNamePath + "/" + candidatesOutFile = tmp_path + group + ".candidates.fa" + + output.append("Searching in species " + asName + "\n") + assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" + db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" + blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/" + if not os.path.exists(blast_dir_path): + cmd = 'mkdir ' + blast_dir_path + starting_subprocess(cmd, 'silent') + db_check = searching_for_db(blast_dir_path) + + if db_check == 0: + cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path + starting_subprocess(cmd, mode) + + #makes a tBLASTn search against database + #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + 
'/blast_results.out' + time_tblastn_start = time.time() + exit_code = starting_subprocess(cmd, mode, 3600) + time_tblastn_end = time.time() + time_tblastn = time_tblastn_end - time_tblastn_start + if exit_code == 1: + output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName) + return [], candidatesOutFile, output + + output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName)) + + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) + if regions == 0: + #no candidat region are available, no ortholog can be found + output.append("No candidate region found for species %s!\n" % asName) + return [], candidatesOutFile, output + + else: + output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) + extract_seq(regions, db_path, tmp_path, mode) + + if gene_prediction == "augustus": + ############### make Augustus PPX search ################################### + time_augustus_start = time.time() + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + time_augustus_end = time.time() + time_augustus = time_augustus_end - time_augustus_start + output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) + else: + time_metaeuk_start = time.time() + if metaeuk_db == '': + db = fasta_path + else: + db = metaeuk_db + metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, db) + time_metaeuk_end = time.time() + time_metaeuk = time_metaeuk_end - time_metaeuk_start + output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName)) + + ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) <= 0: + #print("No genes found at candidate regions\n") + return [], candidatesOutFile, output + + reciprocal_sequences, taxa = backward_search(candidatesOutFile, 
def blockProfiles(core_path, group, mode, out, msaTool):
    """Build the block profile (``.prfl``) of a core group.

    If ``<core_path>/<group>/<group>.aln`` does not exist yet, it is first
    computed from ``<group>.fa`` with the selected alignment tool. The
    profile is then written with ``msa2prfl.pl``; if the resulting file is
    empty, the alignment is converted with ``prepareAlign`` and
    ``msa2prfl.pl`` is run again.

    Parameters
    ----------
    core_path : str
        Directory containing one sub-folder per core group.
    group : str
        Core group name.
    mode : str
        Output mode handed to ``starting_subprocess``.
    out : str
        Output directory; the profile is written to ``<out>/tmp/``.
    msaTool : str
        ``"muscle"`` or ``"mafft-linsi"``.

    Returns
    -------
    str
        Path of the profile file.

    Raises
    ------
    ValueError
        If an alignment is needed and ``msaTool`` is not a supported tool.
    """
    ######################## paths ################################
    group_dir = core_path + "/" + group + "/"
    msa_path = group_dir + group + ".aln"
    if not os.path.exists(msa_path):
        fasta_path = group_dir + group + ".fa"
        check_path(fasta_path)
        if msaTool == "muscle":
            if align_fn.get_muscle_version(msaTool) == 'v3':
                # This branch used to print the command instead of storing
                # it, so the call below failed with an unbound 'cmd'.
                cmd = "muscle -quiet -in " + fasta_path + " -out " + msa_path
            else:
                cmd = "muscle -quiet -align " + fasta_path + " -output " + msa_path
        elif msaTool == "mafft-linsi":
            cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + fasta_path + ' > ' + msa_path
        else:
            raise ValueError("Unsupported msaTool: %s" % msaTool)
        starting_subprocess(cmd, mode)

    profile_path = out + "/tmp/" + group + ".prfl"

    ######################## block profile #####################################

    print("Building a block profile ...", flush=True)
    cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
    starting_subprocess(cmd, 'silent')

    if int(os.path.getsize(profile_path)) > 0:
        print("\t ...finished \n")
    else:
        print("Building block profiles failed. Using prepareAlign to convert alignment\n")
        # Same folder as msa_path; the separator after core_path was missing
        # here, which only worked when core_path ended with '/'.
        new_path = group_dir + group + "_new.aln"
        cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
        starting_subprocess(cmd, mode)
        cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
        starting_subprocess(cmd, 'silent')
        print(" \t ...finished \n", flush=True)

    return profile_path
def getAugustusRefSpec(mapping_augustus):
    """Read the assembly-to-Augustus-reference-species mapping file.

    Each non-empty line must hold exactly two tab-separated fields:
    the assembly name and the Augustus reference species identifier.

    Parameters
    ----------
    mapping_augustus : str
        Path of the tab-separated mapping file.

    Returns
    -------
    dict
        Assembly name -> Augustus reference species. If an assembly occurs
        more than once, the last line wins.

    Raises
    ------
    ValueError
        If a non-empty line does not consist of exactly two fields.
    """
    mapping = {}
    with open(mapping_augustus, 'r') as handle:
        for line_number, line in enumerate(handle, start=1):
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no mapping.
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    "%s, line %d: expected 2 tab-separated fields, found %d"
                    % (mapping_augustus, line_number, len(fields)))
            assembly, species = fields
            mapping[assembly] = species
    return mapping
Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', + action='store', default='', required=True) + required.add_argument('--refSpec', help='Reference taxon/taxa for fDOG.', action='store', nargs="+", default='', required=True) + ################## optional arguments ###################################### + optional = parser.add_argument_group('Optional arguments') + optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, type=int) + optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:20000)', action='store', default=20000, type=int) + optional.add_argument('--assemblyPath', help='Path for the assembly directory, (default dataPath)', action='store', default='') + optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False) + optional.add_argument('--out', help='Output directory', action='store', default='') + optional.add_argument('--dataPath', help='fDOG data directory containing searchTaxa_dir, coreTaxa_dir and annotation_dir', action='store', default='') + optional.add_argument('--coregroupPath', help='core_ortholog directory containing ortholog groups of gene of interest', action='store', default='') + #optional.add_argument('--searchTool', help='Choose between blast and diamond as alignment search tool(default:blast)', action='store', choices=['blast', 'diamond'], default='blast') + optional.add_argument('--evalBlast', help='E-value cut-off for the Blast search. (default: 0.00001)', action='store', default=0.00001, type=float) + optional.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', action='store_true', default=False) + optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. 
(default:muscle)', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') + optional.add_argument('--checkCoorthologsOff', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_false', default=True) + optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. (default: blosum62)', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') + optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[]) + #optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') + optional.add_argument('--fasoff', help='Turn off FAS support', action='store_true', default=False) + optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') + optional.add_argument('--searchTaxa', help='List of Taxa to search in, (default: all species located in assembly_dir)', action='store', nargs="+", default=[]) + optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed, caution: using --parallel can result in messy output', action='store_true', default=False) + optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) + 
optional.add_argument('--append', help='Append the output to existing output files, caution: reference species must be identical', action='store_true', default=False) + optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False) + optional.add_argument('--augustus', help= 'Gene prediction is done by using the tool Augustus PPX', action='store_true', default=False) + optional.add_argument('--augustusRefSpec', help='Augustus reference species identifier (use command: augustus --species=help to get precomputed augustus gene models)', action='store', default='') + optional.add_argument('--augustusRefSpecFile', help='Mapping file tab seperated containing Assembly Names and augustus reference species that should be used', action='store', default='') + optional.add_argument('--metaeukDb', help='Path to MetaEuk reference database', action='store', default='') + optional.add_argument('--isoforms', help='All Isoforms of a gene passing the ortholog verification will be included in the output', action='store_true', default=False) + optional.add_argument('--gff', help='GFF files will be included in output', action='store_true', default=False) + args = parser.parse_args() + + # required + group = args.gene + fdog_ref_species = args.refSpec + #paths user input + assemblyDir = args.assemblyPath + dataPath = args.dataPath + core_path = args.coregroupPath + out = args.out + pathFile = args.pathFile + #I/O + tmp = args.tmp + strict = args.strict + checkCoorthologs = args.checkCoorthologsOff + # print(checkCoorthologs) + #others + average_intron_length = args.avIntron + length_extension = args.lengthExtension + #searchTool = args.searchTool + searchTool = 'blast' + evalue = args.evalBlast + msaTool = args.msaTool + matrix = args.scoringmatrix + taxa = args.coreTaxa + fasoff = args.fasoff + searchTaxa = args.searchTaxa + debug = args.debug + force = args.force + append = args.append + parallel = 
args.parallel + augustus_ref_species = args.augustusRefSpec + mapping_augustus = args.augustusRefSpecFile + metaeuk_db = args.metaeukDb + isoforms = args.isoforms + gff = args.gff + + #gene prediction tool + augustus = args.augustus + if augustus == True: + if augustus_ref_species == '' and mapping_augustus == '': + print("Augustus reference species is required when using Augustus as gene prediction tool") + return 1 + gene_prediction = "augustus" + if mapping_augustus != '': + check_path(mapping_augustus) + aug_ref_dict = getAugustusRefSpec(mapping_augustus) + else: + gene_prediction = "metaeuk" + if metaeuk_db == '': + print("MetaEuk DB is required when using MetaEuk as gene prediction tool") + return 1 + + # output modes + if debug == True: + mode = 'debug' + else: + mode = 'normal' + + #checking paths + if dataPath == '': + fdogPath = os.path.realpath(__file__).replace('/fDOGassembly.py','') + configFile = fdogPath + '/bin/pathconfig.yml' + if not os.path.exists(configFile): + sys.exit( + f'No pathconfig.yml found at {configFile}. Please run fdog.setup ' + + '(https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') + if pathFile: + configFile = os.path.abspath(pathFile) + cfg = general_fn.load_config(configFile) + try: + dataPath = cfg['datapath'] + except: + dataPath = os.getcwd() + + if out == '': + out = os.getcwd() + else: + if out[-1] != "/": + out = out + "/" + if not os.path.exists(out): + os.mkdir(out) + check_path(out) + out = os.path.abspath(out) + + if os.path.exists(out + '/' + group): + if append != True and force != True: + print("Output folder for group " + group + " exists already. 
Please choose --force or --append.") + sys.exit() + elif force == True: + shutil.rmtree(out + '/' + group, ignore_errors=True) + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' + elif append == True: + out = out + '/' + group + '/' + else: + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' + + if core_path == '': + core_path = out + '/core_orthologs/' + if check_path(core_path, False) == 1: + core_path = dataPath + '/core_orthologs/' + else: + if not core_path.endswith('/'): + core_path = core_path + '/' + check_path(core_path) + + if assemblyDir == '': + assemblyDir = dataPath + '/assembly_dir/' + check_path(assemblyDir) + + if metaeuk_db != '': + check_path(metaeuk_db) + + ################## How to handle std output and std error ################## + + if mode == 'silent': + sys.stderr = False + sys.stdout = False + else: + pass + + ########################### other variables ################################ + if searchTaxa == []: + assembly_names = os.listdir(assemblyDir) + else: + if len(searchTaxa) > 1: + assembly_names = os.listdir(assemblyDir) + for Taxon in searchTaxa: + if Taxon not in assembly_names: + print("Taxon %s is not in the assembly_dir" % Taxon) + sys.exit() + assembly_names = searchTaxa + else: + if searchTaxa[0] in os.listdir(assemblyDir): + assembly_names = searchTaxa + elif os.path.isfile(searchTaxa[0]): + with open(searchTaxa[0]) as file: + lines = file.readlines() + assembly_names = [line.rstrip() for line in lines] + else: + print("Input %s for search Taxa is not in the assembly_dir or an existing file" % searchTaxa[0]) + + ################################# paths #################################### + + fasta_path = core_path + "/" + group +"/"+ group + ".fa" + check_path(fasta_path) + tmp_folder = out + "/tmp" + + ########### is/are fDOG reference species part of ortholog group? 
########## + + fdog_ref_species = check_ref_spec(fdog_ref_species, fasta_path) + + ###################### create tmp folder ################################### + + cmd = 'mkdir ' + out + '/tmp' + starting_subprocess(cmd, 'silent') + + print("Gene: " + group, flush=True) + print("fDOG reference species: " + fdog_ref_species + " \n",flush=True) + + ###################### preparations ######################################## + + if augustus == True: + group_computation_time_start = time.time() + consensus_path = core_path + '/' + group + '/' + group + '.con' + if check_path(consensus_path, exit=False) == 1: + consensus_path = consensusSequence(core_path, group, mode, out) + print(consensus_path) + profile_path = core_path + '/' + group + '/' + group + '.prfl' + if check_path(profile_path, exit=False) == 1: + profile_path = blockProfiles(core_path, group, mode, out, msaTool) + print(profile_path) + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start + else: + #print("test") + profile_path = "" + group_computation_time_start = time.time() + consensus_path = core_path + '/' + group + '.con' + if check_path(consensus_path, exit=False) == 1: + consensus_path = consensusSequence(core_path, group, mode, out) + #concatinade core_group sequences if metaeuk should be run without tblastn + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start + + + ###################### ortholog search ##################################### + + ortholog_sequences = [] + time_ortholog_start = time.time() + + if parallel == True: + ##################### parallel computation ############################# + calls = [] + cpus = mp.cpu_count() + pool = mp.Pool(cpus) + for asName in assembly_names: + if mapping_augustus == '': + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, 
fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms]) + else: + try: + calls.append([asName, out, assemblyDir, consensus_path, aug_ref_dict[asName], group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms]) + except KeyError: + print("%s is not included in Augustus reference species mapping file. %s will be skipped" %(asName, asName)) + + print("Searching for orthologs ...", flush=True) + for i in tqdm(pool.imap_unordered(ortholog_search_tblastn, calls),total=len(calls)): + ortholog_sequences.append([i[0], i[1]]) + if mode == 'debug': + for k in i[2]: + print(k) + print("\t ...finished \n", flush=True) + + else: + ###################### computation species wise ################ + for asName in tqdm(assembly_names): + if mapping_augustus == '': + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms] + else: + try: + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms] + except KeyError: + print("%s is not included in Augustus reference species mapping file. 
%s will be skipped" % (asName, asName)) + reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args) + ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) + if mode == 'debug': + for k in output_ortholog_search: + print(k) + + time_ortholog_end = time.time() + time_ortholog = time_ortholog_end - time_ortholog_start + + ################## preparing output ######################################## + orthologsOutFile = out + "/" + group + "_og.fa" + + if taxa == []: + taxa = [fdog_ref_species] + if append == True: + addSeq(orthologsOutFile, ortholog_sequences) + else: + addRef(orthologsOutFile, fasta_path, taxa) + addSeq(orthologsOutFile, ortholog_sequences) + + if gff == True: + createGff(ortholog_sequences, out, gene_prediction) + mappingFile = out + "/tmp/" + group + ".mapping.txt" + + if fasoff == False: + fas = time.time() + print("Calculating FAS scores ...", flush=True) + + tmp_path = out + '/tmp/' + fas_seed_id = createFasInput(orthologsOutFile, mappingFile) + cmd = ['fas.run', '--seed', fasta_path , '--query' , orthologsOutFile , '--annotation_dir' , tmp_path + 'anno_dir' ,'--bidirectional', '--tsv', '--phyloprofile', mappingFile, '--seed_id', fas_seed_id, '--out_dir', out, '--out_name', group] + # print(cmd) + fas_out = run_fas(cmd) + clean_fas(out + group + "_forward.domains", 'domains') + clean_fas(out + group + "_reverse.domains", 'domains') + clean_fas(out + group + ".phyloprofile", 'phyloprofile') + print("\t ...finished \n", flush=True) + end = time.time() + time_fas = end - fas + else: + end = time.time() + time_fas = 0 + + ################# remove tmp folder ######################################## + + print( + f"fDOG-Assembly finished completely in {round(end-start,2)}s (" + f" Group preparation: {round(time_group,2)}s \t" + f"Ortholog search: {round(time_ortholog,2)}s \t" + f"FAS: {round(time_fas,2)}s)" + ) + print(f"Outputs are saved at {out}") + sys.stdout = sys.__stdout__ + cleanup(tmp, 
tmp_folder) + +if __name__ == '__main__': + main() diff --git a/fdog/libs/preparation.py b/fdog/libs/preparation.py index f18e141..18c2a9f 100644 --- a/fdog/libs/preparation.py +++ b/fdog/libs/preparation.py @@ -40,7 +40,7 @@ def parsing_paths(args): pathconfigFile = fdog_path + '/bin/pathconfig.yml' if not os.path.exists(pathconfigFile): sys.exit( - f'No pathconfig.txt found at {pathconfigFile}. Please run fdog.setup ' + f'No pathconfig.yml found at {pathconfigFile}. Please run fdog.setup ' + '(https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') if pathFile: @@ -48,7 +48,7 @@ def parsing_paths(args): cfg = general_fn.load_config(pathconfigFile) try: - data_path = cfg['dataPath'] + data_path = cfg['datapath'] except: data_path = os.getcwd() diff --git a/fdog/libs/tree.py b/fdog/libs/tree.py index 57efcaf..fd3b55e 100644 --- a/fdog/libs/tree.py +++ b/fdog/libs/tree.py @@ -135,7 +135,7 @@ def abbr_ncbi_name(ncbi_name): E.g. "Homo sapiens" -> "HOMSA" """ if not ncbi_name.startswith('UNK'): - ncbi_name = re.sub('[^a-zA-Z1-9\s]+', '', ncbi_name) + ncbi_name = re.sub('[^a-zA-Z1-9\\s]+', '', ncbi_name) tax_name = ncbi_name.split() name = tax_name[0][:3].upper()+tax_name[1][:2].upper() else: diff --git a/fdog/makeCoreGroupFromFasta.py b/fdog/makeCoreGroupFromFasta.py new file mode 100644 index 0000000..566c203 --- /dev/null +++ b/fdog/makeCoreGroupFromFasta.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2021 Hannah Muelbaier +# +# This script is used to prepare the core group used as input for fDOG-Assembly from a fasta file of an ortholog group. +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
def make_single_line_fasta(input, gene, out_folder):
    """Write a copy of a FASTA file in which every sequence is on one line.

    Header lines (starting with ``>``) are copied unchanged; all lines
    between two headers are stripped and joined into a single line.

    :param input: path of the FASTA file to read
    :param gene: core group name, used as the output file name
    :param out_folder: output folder prefix (joined by plain concatenation)
    :return: path of the written file, ``out_folder + gene + ".fa"``
    """
    output = out_folder + gene + ".fa"
    with open(input, 'r') as source, open(output, 'w') as target:
        pending = []
        for raw in source:
            if not raw.startswith('>'):
                pending.append(raw.strip())
                continue
            # A new header closes the sequence collected so far.
            if pending:
                target.write(''.join(pending) + '\n')
                pending = []
            target.write(raw)

        # Flush the sequence that follows the last header.
        if pending:
            target.write(''.join(pending) + '\n')
    return output
parser.add_argument_group('Required arguments') + required.add_argument('--fasta', help='Path to fasta file of ortholog group.', action='store', default='', required=True) + required.add_argument('--out', help='Path to output folder.', action='store', default='', required=True) + required.add_argument('--geneName', help='Core group name', action='store', default='', required=True) + args = parser.parse_args() + + fasta_file_input = args.fasta + out_folder = args.out + gene = args.geneName + + + out_folder = out_folder + '/' + gene + '/' + os.system('mkdir %s >/dev/null 2>&1' % (out_folder)) + + if check_fasta(fasta_file_input) == 1: + fasta_file = make_single_line_fasta(fasta_file_input, gene, out_folder) + else: + fasta_file = out_folder + gene + '.fa' + os.system('cp ' + fasta_file_input + ' ' + fasta_file) + + aln_file = makeMSA(out_folder, gene, fasta_file) + hmm_file = makeHMM(out_folder, gene, aln_file) + + print('Core group located at %s. Fasta file: %s; MSA: %s; HMM: %s' % (out_folder, fasta_file, aln_file, hmm_file)) + +if __name__ == '__main__': + main() diff --git a/fdog/mergeOutput.py b/fdog/mergeOutput.py index 9e7d0c0..7949dde 100644 --- a/fdog/mergeOutput.py +++ b/fdog/mergeOutput.py @@ -40,7 +40,7 @@ def main(): fdog_version = version("fdog") parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') parser.add_argument('-i', '--input', - help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', + help='Input directory, where all single output (o_g.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', action='store', default='', required=True) parser.add_argument('-o', '--output', help='Output name', action='store', default='', required=True) args = parser.parse_args() @@ -58,6 +58,7 @@ def main(): domains_2 = None domains_3 = None ex_fasta = None + og_fasta = None lines_seen = set() lines_seen_2 = set() lines_seen_3 = 
set() @@ -121,6 +122,18 @@ def main(): if not seq in fa_seq_id: ex_fasta_out.write('>%s\n%s\n' % (seq, inSeq[seq].seq)) fa_seq_id.add(seq) + with open(directory + '/' + infile, 'r') as reader: + lines = reader.readlines() + for line in lines: + ex_fasta_out.write(line) + elif infile.endswith('_og.fa') and not infile == out + '_og.fa': + if not og_fasta: + og_fasta = out + '_og.fa' + og_fasta_out = open(og_fasta, 'w') + with open(directory + '/' + infile, 'r') as reader: + lines = reader.readlines() + for line in lines: + og_fasta_out.write(line) if phyloprofile: phyloprofile_out.close() @@ -130,6 +143,9 @@ def main(): domains_1.close() if ex_fasta: ex_fasta_out.close() + if og_fasta: + og_fasta_out.close() + ex_fasta = og_fasta createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out) print('Done! Output files:\n%s/%s.*' % (directory,out)) diff --git a/fdog/setupfDog.py b/fdog/setupfDog.py index 184de6f..cad5582 100644 --- a/fdog/setupfDog.py +++ b/fdog/setupfDog.py @@ -174,6 +174,9 @@ def download_data(dataPath, resetData): os.rename('%s/genome_dir' % dataPath, '%s/searchTaxa_dir' % dataPath) os.rename('%s/blast_dir' % dataPath, '%s/coreTaxa_dir' % dataPath) os.rename('%s/weight_dir' % dataPath, '%s/annotation_dir' % dataPath) + if not 'assembly_path' in general_fn.read_dir(dataPath): + os.makedirs(f'{dataPath}/assembly_path') + shutil.copytree(f'{get_source_path()}/data/assembly_dir', f'{dataPath}/assembly_dir') check_cmd = 'fdog.checkData -s %s/searchTaxa_dir -c %s/coreTaxa_dir -a %s/annotation_dir --reblast --ignoreAnno' % (dataPath, dataPath, dataPath) try: print('Checking downloaded data...') @@ -197,6 +200,7 @@ def write_pathconfig(fdogPath, dataPath): cf.write('corepath: \'%s/coreTaxa_dir\'\n' % dataPath) cf.write('searchpath: \'%s/searchTaxa_dir\'\n' % dataPath) cf.write('annopath: \'%s/annotation_dir\'\n' % dataPath) + cf.write('assemblypath: \'%s/assembly_dir\'\n' % dataPath) def main(): diff --git a/setup.py b/setup.py index a35b396..53b242c 
100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ author_email="tran@bio.uni-frankfurt.de", url="https://github.com/BIONF/fDOG", packages=find_packages(), - package_data={'': ['*']}, + include_package_data=True, install_requires=[ 'biopython', 'tqdm', @@ -44,6 +44,7 @@ 'PyYAML', 'pyhmmer', 'pysam', + 'pandas', 'greedyFAS>=1.11.2' ], entry_points={ @@ -58,7 +59,8 @@ "fdog.mergeOutput = fdog.mergeOutput:main", "fdog.uninstall = fdog.removefDog:main", "fdog.assembly = fdog.fDOGassembly:main", - "fdog.mergeAssembly = fdog.mergeAssemblyOutput:main"], + "fdog.addAssembly = fdog.addAssembly:main", + "fdog.addCoreGroup = fdog.makeCoreGroupFromFasta:main"], }, license="GPL-3.0", classifiers=[