From 549af0cdbae090d3dcbca5c35a4a6064137a05f9 Mon Sep 17 00:00:00 2001 From: ykeren6 Date: Wed, 3 Apr 2024 14:42:34 +0200 Subject: [PATCH 1/9] added uniformed docstrings and pytests --- .gitignore | 160 ++++++++++ README.md | 1 + access_genbank.py | 118 +++++++ process_fastq.py | 287 ++++++++++++++++++ rbcl_Qiagen_tomato_5000.fastq.png | Bin 0 -> 24917 bytes run_command.py | 64 ++++ .../rbcL_Qiagen_tomato_5000_test_file.fastq | 8 + test_utils.py | 47 +++ 8 files changed, 685 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 access_genbank.py create mode 100644 process_fastq.py create mode 100644 rbcl_Qiagen_tomato_5000.fastq.png create mode 100644 run_command.py create mode 100644 test_material/rbcL_Qiagen_tomato_5000_test_file.fastq create mode 100644 test_utils.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..68bc17f --- /dev/null +++ b/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..d581ac4 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# utils \ No newline at end of file diff --git a/access_genbank.py b/access_genbank.py new file mode 100644 index 0000000..32ef2a8 --- /dev/null +++ b/access_genbank.py @@ -0,0 +1,118 @@ +from Bio import SeqIO, Entrez +import pandas as pd +import os.path as ospath + +Entrez.email = 'contact@genorobotics.org' + +#------------------------------- +#extract information from ncbi +#------------------------------- + +def build_search_term(gene_name, min_len=0, max_len=-1): + """ generate a search query for NCBI's nucleotide database (GenBank). Bio.Entrez does not provide filters such as sequence length and gene name + so a search query keywords must be used instead. + Used to make database queries in extract_database and download_database + + Parameters + ---------- + gene_name: (str) name of the gene of interest + min_len: (int, optional) lower bound of the sequence length filter, by default 0 + max_len: (int, optional) upper bound of the sequence length filter, by default -1 means no limit + + Returns + ---------- + term: (str) search query + """ + term = f"{gene_name}[Gene Name]" + if max_len == -1: + term += f" NOT 0:{min_len}[Sequence Length]" + else: + term += f" AND {min_len}:{max_len}[Sequence Length]" + return term + + +def extract_database(gene_name, seqlen_start=0, seqlen_stop=-1): + '''Request data on GenBank from NCBI (with Entrez module), with filtering options + + Parameters + ---------- + gene_name: (str) name of the gene to extract from GenBank + seqlen_start: (int, optional) lower bound of the sequence length filter, by default 0 + seqlen_stop: (int, optional) upper bound of the sequence length filter, by default -1 means no limit + + Returns + ---------- + seq_record: (str) raw data of all sequences + ''' + handle = Entrez.esearch(db= "nucleotide", term= build_search_term(gene_name, seqlen_start, seqlen_stop), retmax = 10000) + record = Entrez.read(handle) + matching_requests = record["IdList"] + request_counter = 0 + while len(record["IdList"]) == 10000: + request_counter += 1 + handle = Entrez.esearch(db= "nucleotide", term= build_search_term(gene_name, seqlen_start, seqlen_stop) + , retstart = request_counter*10000, retmax = 10000) + record = Entrez.read(handle) + matching_requests += record["IdList"] + seq_handle = Entrez.efetch(db="nucleotide", id=matching_requests, retmode = "fasta", rettype = "fasta") + seq_record = seq_handle.read() + return seq_record + + +def download_database(gene_name, seqlen_start=0, seqlen_stop=-1): + '''Download data to a .fasta file, with filtering options + + Args: + gene_name: (str) name of the gene to extract from GenBank + seqlen_start: (int) lower bound of the sequence length filter + seqlen_stop: (int) upper bound of the sequence length filter + + Returns: + data_path: (str) path to downloaded data .fasta file + ''' + data_path = f"{gene_name}[{seqlen_start},{seqlen_stop}].fasta" + with open(data_path, mode="w") as file: + file.write(extract_database(gene_name, seqlen_start, seqlen_stop)) + return data_path + +def download_sequence(species, gene_name, dst, start_length=None, stop_length= None, id = None, permissive_search = True): + """ + download sequence from GenBank through the Entrez database. + + Parameters: + ---------- + species(str): name of species + gene_name(str): name of gene + dst(str,Path-like): destination file path + start_length(int): minimum length of sequence + stop_length(int): maximum length of sequence + id(list): list of NCBi ids of sequences to download. If provided, overrides gene_name and species. + permissive_search(bool, default = True): when True, if Advanced NCBI query returns nothing, replace it with a less precise general query. + """ + + if id == None: + search_term = f"{gene_name}[Gene Name] AND {species}[Organism]" + if start_length!= None or stop_length!= None: + search_term += f" {start_length}:{stop_length}[Sequence Length]" + handle = Entrez.esearch(db = "nucleotide", term = search_term, retmax = 1, idtype = "acc") + search_result = Entrez.read(handle) + handle.close() + id = search_result["IdList"] + n=0 + for i in id: + n+=1 + if n==0 and permissive_search: + search_term = f"{gene_name} {species} {start_length}:{stop_length}[Sequence Length]" + handle = Entrez.esearch(db = "nucleotide", term = search_term, retmax = 1, idtype = "acc") + search_result = Entrez.read(handle) + handle.close() + id = search_result["IdList"] + n=0 + for i in id: + n+=1 + if n==1: + handle = Entrez.efetch(db="nucleotide", id=id, retmode = "fasta", rettype = "fasta") + sequence = handle.read() + handle.close() + with open(dst, mode="a") as writer: + writer.write(sequence) \ No newline at end of file diff --git a/process_fastq.py b/process_fastq.py new file mode 100644 index 0000000..7b794a8 --- /dev/null +++ b/process_fastq.py @@ -0,0 +1,287 @@ +import glob, gzip, math, random +import numpy as np +from Bio import SeqIO, Align +import os.path as ospath +import os +import gzip +import shutil +import sys +import matplotlib.pyplot as plt + + +#---------------------------- +#DEAL WITH FASTQ FILES +#---------------------------- + +def extract_gz(src_file, dst_file): + """ + Extracts a gzipped file. + + Args: + src_file (str): Path to the source gzipped file. + dst_file (str): Path to the destination file. + + Returns: + None + """ + with gzip.open(src_file, 'rb') as f_in: + with open(dst_file, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + + + +def read_fastq(fastq_filepath=None): + """ + Read a fastq file and return the reads. + + Args: + fastq_filepath (str): filepath of the .fastq file + Returns: + reads(list of Bio.SeqRecord.SeqRecord): reads from the fastq file + """ + if fastq_filepath is None: fastq_filepath = "data/rbcL_Qiagen_tomato.fastq" # default path (example) + if fastq_filepath.lower().endswith('.gz'): + f = gzip.open(fastq_filepath, 'rt') + else: + f = open(fastq_filepath, 'rt') + reads = [] + for read in SeqIO.parse(f, "fastq"): + reads.append(read) + return reads + + + +def split_fastq(input_path, output_dir, base_name, percentile=20): + """ + Split a FASTQ file into two separate files based on sequence length. + + Args: + input_path (str): Path to the input FASTQ file. + output_dir (str): Directory where the output files will be saved. + base_name (str): Base name for the output files. + percentile (int, optional): The percentile value to split the sequences. Defaults to 20. + + Returns: + tuple: A tuple containing the paths to the top percentile sequences file and the remaining sequences file. + """ + # Read sequences and sort by length + sequences = list(SeqIO.parse(input_path, "fastq")) + sequences.sort(key=lambda x: len(x), reverse=True) + + # Split into top percentile and remaining sequences + split_index = len(sequences) * percentile // 100 + top_sequences = sequences[:split_index] + remaining_sequences = sequences[split_index:] + + # Write to separate files + top_sequences_path = os.path.join(output_dir, f"{base_name}_top{percentile}.fastq") + remaining_sequences_path = os.path.join(output_dir, f"{base_name}_remaining{100-percentile}.fastq") + SeqIO.write(top_sequences, top_sequences_path, "fastq") + SeqIO.write(remaining_sequences, remaining_sequences_path, "fastq") + + return top_sequences_path, remaining_sequences_path + + +def concatenate_fastq(src_folder=None, dst=None): + """ + Concatenate all .fastq from a folder into a single .fastq file. + Args: + folder (str): folder containing the .fastq files (usually fastq_pass folder) + dst (str): destination file (.fastq) + Returns: + None + """ + if src_folder is None: src_folder = "fastq_pass" # default folder (example) + if dst is None: dst = f"{src_folder}/concatenation.fastq" # default destination (example) + if dst[-6:] != ".fastq": dst = f"{dst}.fastq" + def get_file_iterator(filename): + if filename.lower().endswith('.gz'): + f = gzip.open(filename, 'rt') + else: + f = open(filename, 'rt') + return f + fastq_list = [fastq for fastq in glob.glob(f"{src_folder}/*.fastq*")] + fastq_iterators = [SeqIO.parse(get_file_iterator(fastq), "fastq") for fastq in fastq_list] + while True: + for fq in fastq_iterators: + try: + SeqIO.write(next(fq), open(dst,"at"), "fastq") + except StopIteration: + fastq_iterators.remove(fq) + if len(fastq_iterators) == 0: + break + + +def extract_fastq(main_dir, sample_nb, dst): + """ + Extract all fastq files under a certain barcode/sample number from an expedition results folder. + + Args: + main_dir(str): expedition folder path + sample_nb(int): sample number for which to extract fastq. ie 6 to extract from folder barcode06 + dst(str): destination file path + Returns: + None + """ + for root, dirs, files in os.walk(main_dir): + if "fastq_pass" in dirs: + pass_dir = ospath.join(root, "fastq_pass") + break + for root, dirs, files in os.walk(pass_dir): + if f"barcode{sample_nb}" in dirs: + fastq_dir = ospath.join(root, f"barcode{sample_nb}") + elif f"barcode0{sample_nb}" in dirs: + fastq_dir = ospath.join(root, f"barcode0{sample_nb}") + concatenate_fastq(fastq_dir, dst) + +#---------------------------------- +#GET AND VISUALIZE QUALITY SCORES +#---------------------------------- + +def getContentFile(pathFastqFile): + """ + Open file and get all the reads. + + Args: + pathFastqFile(str): fastq file to read + + Returns: + allReadsAsSring(str): all fastq faile reads + """ + # open file and get all the reads + allReadsAsString = [] + + with open(pathFastqFile) as fastq: + allReadsAsString = fastq.readlines() + + return allReadsAsString + + +def extractReadQuality(fileContent): + """ + Extract the quality of the file reads. Note that the reads are concatenated in one list. + + Args: + fileContent(str): fastq file content as a string + + Returns: + qualityOfReads(list of str): quality scores as a string list + """ + # the fileContent must contain multiple reads, where each read is 4 lines: + if not (len(fileContent) % 4 == 0): + raise ValueError("fastq file must have reads composed of 4 lines") + + qualityOfReads = [] + + # the quality of each read is always the 4th element + for i in range(3, len(fileContent), 4): + + line = fileContent[i] + + # remove the newline character at the end + line = line[:-1] + + qualityOfReads.append(line) + + return qualityOfReads + + +def convertQuality(allReadsQuality): + """ + Converts the qualities from fastq format to int. + + Args: + allReadsQuality(list of str): quality scores in the original format + + Returns: + qualityOfReads(list of int): quality scores converted in int + """ + # note that here we can't distinguish the reads from each other anymore + # they are all in one list + convertedQualities = [] + + for readQuality in allReadsQuality: + + for rawQuality in readQuality: + + # transform the character into the int quality + score = ord(rawQuality) - 33 + + convertedQualities.append(score) + + return convertedQualities + + +def visualiseQualities(pathFastqFile, readQualityConverted): + """ + Creates a graph representing the frequency of each score value. + + Args: + pathFastqFile(str): path of the fastq file in directory + readQualityConverted(list of int): quality scores converted in int + + Returns: + None + """ + plt.figure(figsize=(10, 6)) + + plt.hist(readQualityConverted, bins=range(0, 95)) + + plt.ylabel("Frequency") + + plt.xlabel("Base quality score 0-93 (higher is better)") + plt.xlim(0, 95) + + nameFile = os.path.basename(pathFastqFile) + plt.title(nameFile) + + nameOutput = nameFile + ".png" + plt.savefig(nameOutput) # saves in current directory + + +#-------------------------------- +#CREATE AND DAMAGE SEQUENCES +#-------------------------------- + +def create_random_sequence(length=500, seed=None): + """ + Generate a random sequence. + Args: + length(int): length of the sequence generated + seed(int): random seed + Returns: + random sequence(str) + """ + if seed is not None: random.seed(seed) + return "".join(["ATGC"[random.randint(0,3)] for i in range(length)]) + +def damage_sequence(sequence, mutation_rate=0.05, deletion_rate=0.05, insertion_rate=0.05): + """ + Damage a sequence randomly with mutation (substitution), deletion and insertion. + Warning: to avoid infinite loop, don't set the insertion_rate to 1.0 (or near). + + Args: + sequence(str): original sequence to damage + mutation_rate(float): mutation rate (between 0.0 and 1.0) + deletion_rate(float): deletion_rate (between 0.0 and 1.0) + insertion_rate(float): insertion_rate (between 0.0 and 1.0) + + Returns: + sequence(str): damaged sequence + """ + if mutation_rate < 0 or mutation_rate > 1: + raise Exception("[damage_sequence] mutation_rate is incorrect (must be between 0.0 and 1.0)") + if deletion_rate < 0 or deletion_rate > 1: + raise Exception("[damage_sequence] deletion_rate is incorrect (must be between 0.0 and 1.0)") + if insertion_rate < 0 or insertion_rate > 1: + raise Exception("[damage_sequence] insertion_rate is incorrect (must be between 0.0 and 1.0)") + sequence = "".join(["ATGC".replace(b, '')[random.randint(0, 2)] if random.random() < mutation_rate else b for b in sequence]) # mutation / substitution + sequence = "".join(['' if random.random() < deletion_rate else b for b in sequence]) # deletion + insertion_extension_rate = insertion_rate # can be changed if the extension rate is different + def get_insert(extension_rate=insertion_extension_rate): + insert = "ATGC"[random.randint(0,3)] + while random.random() < extension_rate: insert += "ATGC"[random.randint(0,3)] # extension (after insertion) + return insert + sequence = "".join([sequence[i:i+1] + get_insert() if random.random() < insertion_rate else sequence[i:i+1] for i in range(len(sequence)+1)]) # insertion + return sequence + diff --git a/rbcl_Qiagen_tomato_5000.fastq.png b/rbcl_Qiagen_tomato_5000.fastq.png new file mode 100644 index 0000000000000000000000000000000000000000..7c45ef9939fdde1b552dffbbf44eca5ef2c718fc GIT binary patch literal 24917 zcmeIa2Ut~Uwk^8VQp-r21p}61L_k0!gO+93Dxl{C8}zq8swUDa-Ks%OR?Jq`-n(<2_P$SjV=uXlQf6B<%e3@a z=?vNBPl$Jq-5+um6H6by|N7->{cmtD3`SL$Tk4wWpBUfRPXGJtMYqNDgWIA%70#d^ zp3aylLH}m#*}Hcx{X1*Uw^$nd`oNg6Zu*Bc8y?J}A5PBt<05YV|3&^A)}*Z}Irhr( z?bf;F>?A9F5%a3y6j>vWqVbWw0aoVtfyv3q9m*j$@XrGo=h2pyqQ@0&lJ;E@F$TFc zPA~V^40dQ-TPvpbL_s+GJ&*f6i~zPOEf`<{c>*KX(F@H`v+{V($y)w!1~d+Z}( z8g~5Q_wyGmQc+gEB_AN;7unU{T5{#e6~!}WLgug7@dTgPv1QAa9J|T!z5!Y&V|m z!oo^RmMqasvV7?8(8QU4Pxz30uu^4$S;?#4mc^fZKGS((fbH0D^H4-V+GtyN8#_8+ zO=^bpdw=Qh6ORuvix*8i+!URES8)Huix;sxeP*@gc-#hCiO8YZ^G^qu4iDFR4pi9` zglZ+4`$*V@4R%%rf4Beg8FrM8W`>=I!!kcvSKa9JK_9L?XG4;b#BOgpv5}pj=Q?4>rx|yc+v&?iTKCVd zuhusBTi&j{&dJ#O9_PlAC3#^mB30`K&oypY-36=oOOveXcb|UqC(|@syY0iX$8Dvd zk8xL~LtWLqeSO|MDj~<8>vX=5GbsxT%*x8*irjR!}?a`Mnmiay+N4-j)Fqe&t zjI>r51(X#B9vyhKOyRIs#^k7Jr24ZL^BXnd-50MbroV7~o#eZc5+$$xkEYRYKUubf zM`tJ&JvtDnyF)c1gz3G-czn1gc`#i+XEDBWUxCb6hg|n8MwZ`IrK%Zq{9blGUkYTW zA~GjBEqaHB?B^w8FDaJ<$u+jOhcDfHq@|Ii_kLxau(0qzxt2w!VZKjqPtS>Gr(P>3 zn(OfCrVU>nJEEY##%ojwJhEcZdMUBW-nzPz2M!%-$IkSTzK|f|U$VW+B6A{P{e=#} z*|TTAWwX^pOiBxXp0~{Q(~H?G34M?3>+8awKHarqhw|ZgQ+D~46};9`(y}5VDwpOg zQcSjPZ0PKaywH_==){Q=5;jd9NB!L0-F00?44nIm4uvTOUYWm&KVt6s^LzE22g*dc zuU?wVp&9$^@xf~`@{N;y9sM8UP0Qkpm9UzjmY-gP>1Afz*{Sh4#$Vi~>CMZR`@>Zu z74ceD{OL~*9v!&aHqsoxz9(#S>+ao<#p@(O)SsQ&p%L?B&LXZ`X581fM2^_Cm7eJN z{8}aM%n9lK!u^g@6GH|b&rIIk;S2YZv^T-aQ;l|=y!R?6r#-pZUkS%z?PB*Z<oVH9m0A4y>Z*YBGXx$~Hu&u%M?=ElYArM7W!a5#^3 zCVou6kQnb7BYzmN&|DYI5SrufINh(T_h8G0a=tSrL1P4xEap)37D> zQ}`jz^(M6$>5I8#BFw8(T3`IK0ITYyVvm4_{dD-y^|eK^lRX^Ou2T~qlWl@^FLa8S z)n@3_x{O60EjW5-XHk%x?{3HbGb^?${;+JzvE!-dI^*oSb#DvkO@@sgMcETWwb4Vhu6m2z_amSlZ!eFesAbpjp~%1)A63F) ztw2EK(@JRJo*H)N*G&sazc3VrDA`u@*ga!nP>TY?!-o&KcAaz^sK5?wd%a3e`JRvg zkIwu|jiFiRJLB2-G1+yjvKDboL_d2W)9dA@8$5nuR^Jb_v(+3OQ*;rAZu4oWVw>N_ zeta@9Hl#Jc#y;mda^#5O$&(K>xCVhj)oGTQ`Wts9R>50ol} zsYUyrI(6#S&6^MBE!)yE+*8j^shq)Zc+FsRacw&2vCAvZG&;RWs3uft>!!EIeD~xV zIOW|HJm{s7dM;8h`)+sURLxE| z>yk||eBoM&QS;`{7Ydg5i^;}|K7;2pKXQAAfsn8;zAw*Ty7I7)^Ju@-h!YarH6CGu zcgtBlIA=cY>m6IJNE>wK7Ck=X>*mHVmCRnP6sZ}%*0pB8`(hEVJO$Z_2BFE~qXL{b z%(3>}HN58C*4!?`4>^SmJ?IYe@wjQ7ZZPDz_e?*BOW$JzFYy{r?8)aposHMexsI>H?^}0fBEjizuV0>5*oHQ#-rv)*{^m;u@ zkK5PIJEAOH?Worsew`i6w(8U&vyQHJ9)fDpo9%P&@R^hZU7I;`WLQX^?Kt|l88wZ7Lm7HZIatFf`fxMZrWtm@McM8bt=1280%{7pob05RK#t> zUel|o$KsVnXb6k@$(!nB41P2n#i_o1>y~Yqi&~^+ApSI^>Vrax?aIr72N5?@Mp=_n zxa5GR>-c^Jsh*z^ARiH^Xe|zuu=@P-mScBsL{1*bdD~c@yiP$>=|W_b0XI8BL#d{w zh7Ntxd%p|$_Saeb>x6yxm?C-<>f0RaIX zf}@>@wc2e0weH1%N*S>U&ccYy1|DyFnpbgg?Pzta<;!ifUobQ{nDKddq^~)0XpN}3 zGPW0wae0KsC^iy-sYTo}+FntuW8=78c3JxX;Wy<_#m#0Po<8Tw{Wv8aIo9yjo4;0a z!-l#mcEwYJMP=o!pFTar?$oiLVyAfW0!ra|RQK)MhkcVs$O@SA3?4(R`k=LS3=Wjd zxLOvokrkg4y+$Qmm2ZWB zuJ+Yc0;;yQw$~?sQq=F=yEmGvH);KPz^Ui;D#~XvQ{y(j{PK%Ny_J;}znwC}405ioOAVZO_3Bj!4icbh zw6>vc=J4mMrn-mr?#(jmE?ZsS8xqeYVsh9k!J^h&rvlFKc+dv z);PBF9XN1cyO@|d!bJay#82Jb0tgfB_9F$>T(cNStAtU%9W7rJ5tA1nt5-bbAAWz} zd>D{SO>w8=g$wPs)pU!xs#8U*KEFDf>+R*W`|O8ZD|l6RvW?7)W{*Yq-eR(5YM0k1 z#FU0A9YUT%vbW`|#Z4rXq-n*Q1R~q;ZrHFF0PX!?%J{8et?$C8a*aZrot;&J<<~h+ zjx_V<;*E(|)JOqhziXUhFZvSb3Ol=8KRaf0q~)=vb%$s$=K#{M34r-1Vu4P!?D&`c z4(HDkP!%n!zr1Kof384g2=aTmx}Ss{4hIv(LQdddiGpldYe{ffhVuo}2%Xe(63(M> zfU?3Yr26I^yLT%Cmff^%?eGG6Xb0wChbjd}qzn(AkJL^7XyC`LcO0}@si2@x7OIpZ z!OLS>+jK5Zs`t&NQPed$?{A3NwcX0i=bphB%vvbH*jpTUL&~A|w4aQ#CWq|!Q@d3| zTm8>}?W?__3X0>B1uI&}R^X~+q}qPZStQLfR`OAmg~Z>!DB?8=;SF)%dTftZlX5@2Ut zoE5pjK74p&gfJrEOTxc&C{RD`&`OAS^!RaEnuCpqaq$m;*j|Z=iH3Q1AFh);cLqB& zvE!KrfK2JF?Mlm+EP1$I+OY*lg8hW)tvxw>GZ1j&%5T3F0e-1uI5`kz2l_e=7%}k4 zojc4R_YNVy7cXAKoqoHNN;ZM?{$jfc!A}?Gt}nDHkPhCV5*C7@^J9|L<1O;{LvGyo z6YEf)gQKI#i=XFR@$i@imqz{$2Q$Zeug>9EAJLTW+m48nXcBP#^E~V>ZCubxQAvr; ziV1F)P#Zqey!8H|Yg~ay{^|(*#f1;|qh@tTN0^_#aA7G{M8|QUw^tEpDbBj_omPxz ztwZxVeDVk$m2j9XKHctd9R&{a-Nao7)T09^D#L+hxx_3V2nh-8b{RiEGCHam+&7DH z_JyB)$Sg)fWJGFGfSA<_gEXaJ`D`|ueYP?of46znhvDJjyLgAbEM3R9_hpE$s$i8AGA-RkofF02VIF$TCY${6oG8s6mC{n>xBuyBbJ>X{(qs7kENoDe z_(o@pzLWZrH7f|mg}+@22WF&McFK5%TGUzRW<;pFlM`dUuU`ESF7MS^oob&T^YX=u zV?Z@grg}~&B$}Sj<=(y6{pDfnk#QV?lasxy%+-2&dI*jBZRL^eF;mw@YIUjvGRMZt zu3wZLtGmQ;tnuCLYcD_TYRqMc78MmWAxevS}HmOmKKV%g&yWB_sBF9pj{Fcg9%Mb#B=xtV9b!o^oGPzQtR|MDyyf*$Y;N zxJ-=h(2NTi?5Yk2b`CjH^r-D)d^=wSc+0E-;-JjKJv=Q=kA!wfj;_v|Ng#^O+)p=3`U*nAKYSAwav}S6;9Zm zC?4bSkwyRlfsEl72FZd45#p5b7oD7(K%eM$Rwb7NZsFsT1C~C4;>O9@xhzO-%{p=G zSAko^#WfIxttI^MKMES$uH&B+mvM7PBmC!rU?85TGBLWiq(sLAc{mC|>upZXp`j`p z&x4-p^Q+Z}1g%?a$*$$;%DX4*Sa)$Q730&eHlY#KVIiX5Zlm{WRl<~+7HK_~eMBvy zP|>|BC{XZPzG6k7K<2ojpr9Z?TOndgS(pm%>+Ebz(S+ZA{k0HpJML_S5L3}}z4UTU zPUGrff+rMbep|XZ?p&uP0Xz^w-X4OE?U9K*%+SzK6C?uym?%k%YcrgQ1H=Q1Sk`5k zj%EP85DPFqKHl)@lU2zW2X21vR3}Y|E9NR z0pG$jGF{Sdo(4^z?;+UK9-X;|K7~)mo2=ZaeiE;n$z%#IskXlF;Op>WVKUs(F+z=FWQ!0mhXsDd0RLdjPaa$=MC+QlrNA75V9>3N@JY;1hw z=+W33-zis#@lP+^!Q>Qa#XcsgyFn7KVuwyjoSosWDo;26s|b(GcY{(@ptMkTyb0?W z97IBPP4I~**t%uQj&0kvRgPZz=_hRy{@UwE)JW$##02@8NowR;&u>8xM4(kp=mD{` zbeed#Yz@C=`S7_&(zu&Qw)Vh=jI=Gmqgp4tMK-wvx&ZzJ-MY2B-ER2HC+#uPC>Tj| zXGpKYtMr-__4ePqefwLVl(zUUKmUB@^_3On>bM9_{9`Q9Q7_^l5o%LPd3bov z0OYF5r8^GX^748F%2ZPXfYTd1fVRmwum(T>{7tywNN=OH*jm&~cU1Zi=2N8{&G1`b zlJn2CNz#pPa7}KXH44Z^PzX#alV!ee6@COH1K!)VW5*E`p=DT?I53lTT~!(=-Bo}1 z;b)$=c*DdWA?vC}X@?S}LOdFRG8J!3MS#nG%FKv8A3>w-Is4%OU~M>He;Bd|&vpbv z{H${N^pk|M6_0N7s+kl%*wGh^EhMH9Lwr&>7E;G z>?h|cj4*{Ogg7cdG{WJ^=Pk3wjo9YD^HM@PcNgYQQjSy|2B)oLLK|S zruluSaY>Mmu+jB_g|^I5+-6y#g&oTH)rAMlLHIs#@=K~9V^$#wp2STCwIG~Rr$z~`RvE^Qtbs} zE>2VppeCn}ot*pMq}0?kcFl2HbK+c!mo|%gEnCP_g5s&0)+Ns0<)*(?ilX+p^XYVQ zK_tVbF4=YI|8sKGc}Nv?=e9}OGNW=iuAAOF4@h zXC*`px-XqPJQM#dId8_gtVO96rg^ln#yQx3BSk{J&|VV!l=i)Y`=77+w*uv;igw!h z6I0{Ovd9zSBg4ZXP9wckq99L2f))d+2`wxmQ+1D2q( zi_G;XJlOT6!YEnN(rM-kQ2C zR%(I>(&?ZqeI)Ip5HW*5D7XKzO7{j1a;D1!5A(kYH>PDDD$%Gm1pVsu1hv{WAt6OQ zmytl8?f(pOR6VutGFMBz`0=?FUtfs#8dj~vOnbhGX7>PE&<#rpqlx}?Yzqx8jcE*= zdLw|M5t;u1ZvAM=?hBKk?M;xRh>zzX06c&3;&KwzK3^5sv3c_WP`4Q}qb-|6Ov{gX z?IuQmUq_ROc*^3aHB5206uc2!H!(qt-a%LmCc+%083}ghIzI-2_~k`i@3k9+IOPZZ zY>yLw@qLa%w6YBICGu98R}OJIEAy&Fm15a>m=OQUP$~~Jcq<(7TD(^50kZcm?(SLv zR$^-bzi$E?Ze?D%cC8GlM%x77QzhNeF5D0g3KXd-#(Dx~?p>>GN+}g-^P9c?CzRNe zGg(qxe9OnDtrxo!H2{Zji<%(xy}&0=2<>}=eWVf*vW0YCp0L*w!VQWbXmgu?x^&47 z`bccW^Mlgn!k#b&;t5bh_Eo$-hC+ zurT@*>KC9-|E7X~=m6JA4J4T|)Z)8A+Os4;0I`O$@1E6Xr4o=>6FL$Ll(T*F=GTtv zWSqp<{&csDuICF!RnmIj@p4JnhWNXT8Z#AQT?GkjK4?c<<8!GHLCTxGyu2bR zx=@z~1_$$si&q$|l-|Cb3G9_oB6`nE9yjqj?3F>*}NKFy5ikJgRLB~$8_;q-6BpRi@f({jcAC+gP4qRI& zsrq+72iR)mwDVSrxONk<@wBT=7bLRu(|qn-horM zY^!`JGzXD0g&XX;Yy2dqbMju9Lwhr?B-S_{MLAAb7CVOLm&=z){wan0VOX|sy_9ea43(f*cuWeVAIA~sFo2I;H{ulX~sCS-Gy<6Rx zj*7uY#yKUv3L-F6t`oRU>4SZ<5YV=5-1xos7P)tqq-UglT7C|zRQLY#j~;?u<})3( zLS2OfzkU1m1b@5CH7}YG2rSzA>|e^HtS~<(izrYP+Iw_Qk}op#hvhTY;x8wqT6&Ao zS~Jf6IlZHHbd&s8#=oMD%MO3=VfKu5mdm09>E(Yhy8rp#ndB*@RQbJItn;~RCEY}p zDs;-}AO1bgm9(mAJfqToNSXGMhW}?7W~MXq(^zEYhLrKTq&p6!^RS3rwt0V(VgELl z{$Jx5n&4V1?WkNJJ#?RYD*fWxO3499Rb<}Ah18+oZ zMHD&?jx*l2Rk8D9{84yqRN=jd@Bt}w=H0FBbny3>eSCsoP0T1sk|vKHJt8N;>sPPN z;I9x*t_k+u!((Ci7Q_Rn9YmPk;nNC2QFh?qLCsv~rlrtx{lsk!qV7^TcFcpQi)|MkvLqDK^FY%YY0LICH zPbUQdJiZv^`3fjfOfb$T5G0}2JweH+J1_>37O5IoqtU0$VwK@at3)xesK8V$-0qv3 zIx#aQ-8uWi)zi*DgVYe22?~7v{Q1R~IAlUJVhuHNIsPX&&1AioQgLzWMqiqFG4bLal2VcT~j+jQh_(%r#HuNoA^^< z^k=r7YZ-VYN;A%V`J!zVvB3z};own^d%;<_^LRh`)^PgJD1z(IGD6YDfZn_P<>KL zQD!HliDsW(lUmg%B3@Q|I6nY_;W32#^7b#EUM3Gtmjsp^Ge=c!?bJ|2(QL;V4mZu` z`ufdGq8x6S&5xGC>JNrD1qZk& z9yxO5Xg>7pTQ_c;EZeD+B$&wfMymF-&e?+fEB|`lK7zh=hs}-OHn1FzwSi|QmUK;{wp0pA&AN|2P zunZfKa;=cpb%?m-2_-X1S=eom={uS8IXKk70HQVsC4~o~C)e&XH}D?b=S}C!2AgmC z!_NTsCEcMaBn7?lfs@PG#dm%&V)|3w)YG&6UXAq+Gr$dtS)-u!tE#8bFeC9;#-f&?nhVjyU_vozMJdo_ zKAoK1e7*2lAnyU_115D@7YX}OfB>(QlFCP~j$wWne;pnIHaJ(?{+1K@{?b|ya~+|E zr~t*u-4i-MkkkI^^ry9Boz@h_o_BBvg|0CQtWUyBXKkh)y{}NkzEF?MlDYIjCBxoYzLzJ80{$esrU_r;ZTQf!MB2&6r@apT5X zQ(WaSsCZdV$!U6~ZnUw|Z4zX^c05%Bc+2lDnNL4clK99VSARvUQM>Hah(baSm8(=O zu929&dR;dDq73v0RHTM$P&Lpdt}dLuV}>Yy?VNrDW@sqZ71_{mmUDA=U%vkJx>mTY zxwB^*L;I-6CUF6KThw;?;AdUSO`I`7&Q0*m@s0x~2B%LGPfeCFl%@R;VNz42r!NzA zG$Z}$*cyv8n7$Sh!`0iH(l3GndH`!}vuGL?Y^Z&kIHMYmMeav1#!-=4iSIyfkvOQX zem7O?>k_2kN&EvLQ;~NowQ~CMbwLgIP1uBB$UB3@B{!3yVNh-_LQ-<5@Yh?=!yZAU zRst_8Wh;`-*RNmSWlrdY;kM?E#Vkk4T(EFTk2H99nNNoyH(&gI3gpc$=<4=~2%MI= zx7MXJgo(_}iO;(v58!jdBH)sYW!+xDXgMUg#OVdklFLmLZb3FIL$s&li#w5nE@P_! znL8jA_cR<3o_|?z8Ok)bomk^lysF2LZEfto+ODgX(sqCosQT*;x6Ey*G%eiR3LboP zu+n0nE0y?vC9fxlK&{{f&AX;)KDTyn3!_4gbrMfW$ND!roc?{=rZ?TLFBbD*O^S?GZgFosF@E<{iIZk%cQlZ4XEIqBO%_Px6jq ze~53&?^~;(lCwWA?-<_x={2(O6G3ID`pc~a8Ld5Ev^K8h!uj(pxK}PnL39>|M5b%h ztVM4UgOe2eBve4RpFkKS+4aB=KeVv)Tu8P7Yf#rUIDNlK{8$xtKn8w(`2{O?7LxB2 z-%NHp*is{Mp-I=YbO%WKjv43HJ!Ybmog!y6k% zo~*K9e++zezE%`nT_JA#OmMXXTmIOkDfAAZXLR(|tbU95Of83C{&(#@2qp*S{cSUj>e_17mO0=zlEwHy+P3Xl=A7+eS0so*Ai6V?Y!RqZe0A}A0(Sy|b9KS?!^ z3SLiOCP;^e)a1j{hsbO#O^c`RB~zdmtkYhgr9v)(Sv=L`k`5q zXVZ&EKtOj zLp$fqn|I{!;p2fvZa1{I^Xhw~YJJ_u6j&guWria>QEZ_tx|xUPuor+`_NK3w7?GzS zGB%_JNWB7LA#O*j9F zY=9Tb>YjK(Z>2>hOxH;dH%839|Tb#r!{t+`?_+*Bi3`$;aB?jOy3=XaaA=n#4s3% zKV;nB$9jUzrAvfe_O&(5U8;<_%eVh3LNe;iacn_wwbxEfCIY%W1jh!Am2EJBd|g%g z=VhDS51+gW3?OBIX>dk!FukxDniXG$cGI`Hjc=))-4EJl$9LcD3s;N2!vp*5iK$U` z=JK^`JDT3pXBbIVBVAjpvRAW);A|)ZjIAnNH2nvO2Qz&4WmYv_T-RmKCz?L}&XeBR z3%4K%R743j`%DW9dtF%FhK1JzNj`;0$wNQ{*xM)}HOUo`yXy=or^psw-7_6gENR!s zLJ>Cq%8k2eZN>fJ!>Os$@0nqq>}3Bnw=wKD{wv)wdclxDzlvM$xP-IMG`zV9LF~4t zrzeTDy~qGIP}K13CVHZOQY@U;hV>>^y1<&YB=R?H95}l>Knv zfHBBZGVb8**^TxalXD5yGF3TfZl6DYCczq!<{0Wj=;?8Hj82~p!K);vTsgAG-Mg!u z2P=#q%GGsF^-eEht?GzxH5jWx{of?_3N?os4xkr9frEgZ-+OgB9RHpp-= zfF>|*cs)T?m_9W*<|E}01Lz+_B@5D93AV`q42eg)&UM#B;#*s3rx7I#zHHiY2*VUv zwVB1ejk!p6{Gg!~IAn$&&_xi&Od(?)k(0|G`wGUA48Cn)hrm7yPjVnW(j??kmo;NE z^jhqFE}MUW3=UHYzOJJ57Zo3v=+ZE(;4i53m#Zao0-p_Kj*K<|3MwRp;F)3*(ucLui`y00)7~+5C&SVaI_xHZ>;PY z8e##asMsW93!&&})6CdHtDJR>7*tT<;d+@a!~(-0Y5V$LrG58-di?M~ee>qcp{kLZ zpc1_xRFdRNO#t}M7{fS$famf@j>O!7D7;NtTIcQCx5?#eWnH8y2Nmv%eNwa;IS19l z4n@*g;dTuisVgh_{>*{TnAqX7$1JR~GLe^`|Gi^bl#UkVaqK2#IK%#snT=q=h&RmV zDvQ?B^HL^rCfo;sTMz^!&viUCq1TLUfl93?Yy@Z&n)7GHSZKls-s&Q7v$0-&I*#(t zyxoNsgi(dU%w>g(iwkY8+`BNmfTD@$2?^Ood@c-)$_UpY=Q`AZ0MwzadhtN4iPA}( zu06i}_Bgx<#L&VN@G_-U`!B>~$DwNb%}i;>{^LNv#6d6>Q9#4=9tNpL?!MTGtqT4i zRxg#pk!6@ouYE(nrU$h%;cP002eC9HLsq+kF=Mfkv;@h&&_D4YX@^a*(I zsPhT1LNy-%F9L~N)Bb`59MVAw0p@%|{r#oz-P46|hkIpYWY7;$2&%ugD|b5HnJ?hb z_pq{G_J&*vs8^3LnaO6cT?X#_$$agzyHtT`30w5Oi63{}HnoM3q|&=_P2H&F??LQ; zQKz39i+T+kqB;sUC-cjSMH#HO!`-@7{`OBqx&Ld3#D85nCc&1GB#xK))^224m0L!; zec}IM^z%P%Kb_{^>jroHwE(|C^gl&RXI?@4ltzhwCOc}q`j}uA4qY*&bhZ9&>xRSN zE2$<0_fQTKh>pp>fwy&45=E}_%E!phu;)Wjw0lBVm_a{$6r z$Pd3Q9Yt|eie?#RG1^0vQ*4c?HPZx{k8(2z6O)&h7g2iTF{U5D(!nJ5_}y(@Q?x}8 zBS!66(9@ZUsK(J)6al#lv|xYOMDSlSwy(_aA9*bw4nB`tjL15StO^|(Hrg~!6Ue5G7rLT{qG@IpvuL2cNJ#AQn+YxPXq3~oP zIP!~tQA&J~%mdQxdn9YCSeTj-F~&@I+qO4kwHh89%dM=e3`^FwwT-irONEw? z1&AIpokrumj2ROWY2&@Q*nhsLfNz-HLoJYdLH}67vs@irZ>5QArQ;P(e|(8-cZxdN z?wcx1-_p>A2YQdu)_VqNN;BOtp8ATex&Qq0VZ{W_4lJcm50Fhk6TXjcf|=9 zHFN{1T93`Ji&z z2BIxY1Njaw2XEi5JyVLlazg>hOsVbY>LA&A3eAi-kg%LD&veF!|JA$D=jS@58pdo2-Lt0K!3QLcN-T!sqFt+ z(mDLn{GiqV``#_L=a@m34E>b$%BHZ>-Spp~Gft~}ZWmC&@vs|-<9ZB^i_vBNp9-S? zPF3{})U2b$#=X_aGtEy6ZV2ZCF>?lYczm$vM*8 zK7#s}sY?WO8&R+p86!dH%x~#RU~H&8bs6aD!;Bfy0UDSq0NHal+{lESiT?0Mvj{1s zVAGN}S0;UQ4Vjr$-K06G!}Dquy2%>9+923bfa{Je58Zd$G#xXIK(sZfOo&8xO!VkT zX{j>wKFW%4-;0-1vtIe8ta{BaIU5)bt(q}2dRGd!j2woE7Rd-^n&Ye3{0r-sB@7bSE_!aAgred*Gr+jsBYtvAA5flCiLTOW9omBA-HlhN^ZBq&uf zEAmCb(yYjHZKd0ByWqH&6!vZYvn@!2EX6Rms&`Kf*K?rXZM%$&zuxyrDkdgjB*&t% zv^lb(>Wco7bI+fBEBWbsn&fJITU-lBDJ6BqxGQdlXV5+&igXWweossF5Fx$}j**!pg#V;?IP6znTF1V1!h8y}4|^*hn}Plubh zth$HohD;KS2^mx|kD{Zgs(NXcC4AhD(C>(cfAC-f*tqG=hfu-4$MJ;fSMlLpQDXyg zoT-QY5{FidLU8O#ZeETN1K=FPk;(u^;_ui|Tnju8fYa%N*QM z;AleN@N)e8+g8$fy2>r*YQaI}IMfvhWs!W{)L@82XxEg-8E%N4Djy+(E2uA{AaHC4 zcA#!{)TLD)#Jf)W?lCJypG`1m(^`Z3xY5o3*mA&i{ZPX%J@q;5#OFc=h`%soE_Zj= zK5{*2S)?13S?-+akXIM=<;^C~yHN2{JgV($oef(GADXv)YyTM7Wz*XTjo9qf`p2N{ z09XhDOvIYJ50JG0ZgU`6qijppNw9K4DB^bmKTr4JK=KjJw%obVViBj%D=C0eHSPC3 zzor#K0xAhSx{CD-+>vjy|AqIfUq1UbX_jz{{l)%%Tl8WO*JD84F=)ZFQBq4&hwtex`(Fog09i&ULx3Z3k9oKPR*Rs=ydQ11^$1Y<6xGpsrC+P354X>_OGEq8keS z^Vo>SwVrQy2`-X39UK?aD*>_^KG}xJ2)AD0CN7~M&gvWjGfWoDKNuhYOIzGVB?^&~ zSqy1B2@x1P8aaCMIZe>jdeaV7RiL@OAI*!VXj`hdV}-kzK-0I6xJ?kO;1AGD#7$ju zFcR?M#zEWq*DrA}d~@SF5CKKbL)BX$sYAv&gC=^%rh7&_JrO>eNe+SUqTaYH`8zpPtwaWO^?^~m%O=f|Fa8gHO4=nr(7lV zoEgd!1@b1qZc!F%q+p+TSyH{bCSAmJ(rIESbLuwwvqL25LNbQYEFxl2yisfO08WC5`q7$?L3|0^* z&7foVgo4lw8U$0eAq>Un_sS3Zr2b9s9cr@ce8E$`I<^GW|K$OK%JNmK?2leZSx0dd{;Q5^PQ(SQs%)atc-m|&9mU)i(w10EGy`MhXIg92~A>DywQ6@K8Xc|xU(3zYY z?pp@tl7s>1+Uc9yXEC%n0sXrEge6JBFd7(Z9-v!>2umCWYkMv8PxbYWk?!mhtxEW{ zlPvcKX#I|}Fp!0;wKRTUnS#_qG`sS;phrd+WjYJ721hNBkRUY%M$`CtxeWD+%S?0!v!t5EKk0yf*>|@ZmxF-nM;v>y^s!zSa`hb&EnyJdU$#S8ZQ-3HF8% z$XXSXZdN^?574fIA?=M-I{dwg$Wo1H=*Jl*oJ;5?-U(MH1q1kM2TDX`-IIE> z7{Xm?yr~+3fE*TRTC%}V2~)g7wedXchY*n3CJ-9KPCUL$wKontA|ogU*JLk{2ESk8SfM%(4e7;r~F zI%TP;0w$$WoH-jQ7aH=#uaoiyos|IVMF>;PG2;MC)HgpqF+pRQg8dzjpcOSQp&INS zkSS(99T9hl&Mn9rhLfTnh~o_|i4&~^4-jna4|%&*F+au{d18_|rRE$kHNL+8S{kQ?)tZ?k-@HzdsyTwZyP zIHEWK1Mpjk*E?}0+W|GaZCg{O$Z!ISl2?C$>!cDMvmEi3P#HNM$*2rjufDj1Mo*xJ z1b@99W(XD!JsJ8l3K7JjQ{Yb(ea>L?rlaWP1XiqXZ#PBT(ls6>jM-QQRkOVrLv2n1 z4;Z8m_4NrOdcA$NM@Xm`(dMN=2SzLGqBdZJrQ9B2z&Byc&a>r2|HyR*^iM|&LAMkF z=&%WOK;O{xB!Le@_~L6kHYh03NDR?`kJm96mC%Nah(a?E$~;#KzA*6BfXrr+?a6JC zZh`u{6%+neBx*fAxJcN#;mukW!b{#QUbXj6f`glrq@ctieP$bUpg7wDQ=d@wY8%MA zo8G1&Tl0&cI^NUQTwu?60lYOev07KX$Bdk7Jbm?^vMC;(_^Zlc%B@#w`Cx5b4#R4H zI~&G5n`|1>(BD;EdHCI+4Y$ef-il;5jE~ujbYH?CEIc30j)>4o zd>@#ARE844-)f|4m7dd0{Q^WgwQ&S3OH*piwEq?PdK)l3>;M>#6+m4sn1J%4ad{=! zUu7zPFlj;g$>AmG7-quH-pm0G9Ua+9@Bv`_g;;z4+37|rtSfz}9eutFkha>;Dla}5 zhsqIqyuhHNcZhuo$dlU4makm7g_4|kmAaR8deEq*EpijK`5m+}Z#IT+6R{?DMLWV# zIGjrRL9P~J!h*K>3+gEb$;EJ^MJ#?^+WBQxvbRd$H!a zE3uDgJ|J|aE1sTexNg+>z9>_&k(wfHQ|v5{){BN6D-0xuT10nwq%S26pap7*!Z@6A zxT`T{ha2rp+IAO`e`(s8k&#h8^*%+s64kjf+OkW=w3q= z?)8LNiTl~yG~ue)_zM|;F@0(a>}VhyjzI{4nOi%-?3+1r2w%`#Nn*$le6)K!1u#&8 zB$qudBizC$0HRj+$7b7nbZ5uZL^z`oB81V30P}t@8PllLr^X6o-AzS+z6Lrg)v(1& zusH@Oa)Z}6j)M|yDy>l~2=_kFXgMN(Y#A*0y(#7Q1}0dT%7|f?IR+I00;V^j#;cL+ zT4Ow6*IEzC*3uoI7jiwKF*GA*Cwu2Q)OISC9k@Oup@00cu z?~8U0D3$~gi0LQ^RtN`@@p^(YrUpkZIj}Hy2du`#QEdMh8CU2|=}-y@pHfFAy;K_H zwqk?(Le5862`ceGJd^-UJwiMPR|t?%>j*{dJPJpBKW^X*>PkX^=env|75ivJFb%sz z5Q;M@SdTQF+c=UBWPKC(p3n8A_4%-Mx_WO|aIqNFdVUU}Z-U7J^WmteaAZ&>5+h zDU&1XrOk^3gJvTpMRJ79NI3GsP>=06{vb=m;?WaxrM7zGBfadhr#rXw%{wRh0eQWy@%j%|1E0G{5iw}DAL!NlCT zObn=DYJ@gC&Cdo6u+0WA)~pqnjmGiaLKMNEBO)m<2C*2OB(|{Wq3krK`4wQa$VrET zZAuvv-xhuT<#X9F1a~r9v5qc9fVeou?~1WJJJ4Ami@H_fhK=DL+>Tt>GcX_m-0GUJCP2p= z^x~FXy9$?YS9}Gf;4S)BKQvx8L$Ti!$kG+U78M1sp~0Dza-X68T3{ZEDGF8U=+VHw zyLlQ}^pr>PsN82FeV0AG=F%87HLLeGwZuq~#BW_@`89Xke~9^jXLhFrUU1|`i zM6vM&@q&X8`wdaV@Y?}O2?Hnyqihz2*Q^P!QWz9&d4U0TZ4gWp**Q5mhlOzJ(fmiK zciGaVr;z$0Y6J0(gi)vvVpYOSNqRFSIA`r2-n2GZYe0OEv$gG48JfqaYdXK$J6A^s z&ZgHiXaW6c)PdM#exNuo#~hV>8yFCpG+bT%ew``G*ijt+a0HvCJCS`Ta8khRPxt?K zejKEBN@cbZEXff-d<=@5>uSu{G+O23m^tL(3=YI=p!1 zXV#&>D*^%_wqP4Z6rcy|4i9zyB*v&{H=&|ufrdT`!7Q_N;F!;DCeU@f`Q;%LlG}Fe`m)mYSKfIX9QN?vS+eVvfAzJWb^F~> l|JQ6J1gZZ$=I~E)%kb}#p3yJ1pF}j~0r?-IzCZcX{{WGMfsOzG literal 0 HcmV?d00001 diff --git a/run_command.py b/run_command.py new file mode 100644 index 0000000..1d91877 --- /dev/null +++ b/run_command.py @@ -0,0 +1,64 @@ +import subprocess +import time + +def run_bash_command(command: str, logger, windows: bool = False, blastn : bool = False): + """ + Runs a command in the shell and logs the command, execution time, stdout, and stderr. + + Args: + command (str): The command to be executed. + logger: The logger object used for logging. + windows (bool, optional): Specifies whether the environment is Windows (True) or UNIX (False). + Defaults to False. + + Returns: + tuple: A tuple containing the result object and the elapsed time in seconds. + """ + if windows : + # Adjust the command for Windows environment + if not blastn : + full_command = ("wsl " + command).replace('\\', '/') + else : + full_command =command.replace('\\', '/') + else: + # Use "bash -l -c" to run the command within the context of the bash profile on UNIX systems + full_command = f'bash -l -c "{command}"' + + logger.info(f"Running command: {full_command}") + start_time = time.time() + result = subprocess.run(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + elapsed_time = time.time() - start_time + logger.info(f"Command '{full_command}' took {elapsed_time:.2f} seconds to run.") + if result.stdout: + logger.info(result.stdout) + print(result.stdout) + if result.stderr: + logger.error("Error: " + result.stderr) + print(result.stderr) + return result, elapsed_time + +def run_command(command: str, logger, windows: bool = False): + """ + Runs a command in the shell and logs the command, execution time, stdout, and stderr. + + Args: + command (str): The command to be executed. + logger: The logger object used for logging. + windows (bool, optional): Specifies whether to run the command in Windows Subsystem for Linux (windows). + Defaults to False. + + Returns: + tuple: A tuple containing the result object and the elapsed time in seconds. + """ + if windows : + command = ("wsl " + command).replace('\\','/') + logger.info(f"Running command: {command}") + start_time = time.time() + result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + elapsed_time = time.time() - start_time + logger.info(f"Command '{command}' took {elapsed_time:.2f} seconds to run.") + if result.stdout: + logger.info(result.stdout) + if result.stderr: + logger.error("Error: " + result.stderr) + return result, elapsed_time \ No newline at end of file diff --git a/test_material/rbcL_Qiagen_tomato_5000_test_file.fastq b/test_material/rbcL_Qiagen_tomato_5000_test_file.fastq new file mode 100644 index 0000000..c22edb2 --- /dev/null +++ b/test_material/rbcL_Qiagen_tomato_5000_test_file.fastq @@ -0,0 +1,8 @@ +@632f5334-2c26-4f91-a611-bbcc85d83ea1 runid=84dc383f16e4874203b6120bf223ea75d3815a54 read=14 ch=417 start_time=2021-11-26T13:56:48Z flow_cell_id=FAR79813 protocol_group_id=rbcL_Qiagen_tomato sample_id=no_sample +GTATGCTTCGTTCATTCAAATTTGGGTGTTTAGCAATTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTGGATCGGCAGGTGTTTAACCGTTTTCGCATTTATCGTGGCTTAACACCAGCGTTTTTCATTGTAACTTAAAATTATATAAGAGAAGAAGAATCTTTGATTTTTTTTTTTGAAAAAGGTAACCGAGCTTCTTTAGTAATAAGACTATTCAAATTACAATATTCGTGGAAAATCGTAATAAATATTGAAGGCATCTTTTAATAGCGAAGTTTGAACAAAATTTCCAA ++ +$$%&$$%00*268%%)($$#####%++]<;7-$$#$#$257166>>8;7820.(''''')668.0+,-0.,(()''(,,++55]<<*')10024=//.*,*(),/)+//00347%&'%#**&%()'&'(((()'***+*33373356*,+')),0,)& +@c4124d21-64a3-4b8e-ac5a-771ea99ff56f runid=84dc383f16e4874203b6120bf223ea75d3815a54 read=704 ch=203 start_time=2021-11-26T14:00:32Z flow_cell_id=FAR79813 protocol_group_id=rbcL_Qiagen_tomato sample_id=no_sample +GTTGTACTTCGTTCCAGTTATCAGATGTTGGGTGTTTAGCCGTTTTCGCATTTATCATTGAAACAACCGCGTTTTCGTGCGCCGCTTCACCTACAATGGAAGTAAACATATTGGTAACAGAACCTTTATGTAAAGGTCTAAAGGAGTGGCTACATAAGCAATATATTGATCTTTTCTCCAACAACGCGCTCGATGCGGTAGCATCGCACCTTTGTAACGATCAAGACTGGTAGAGTCCATCGGTCCATACAATTGTCCATGTACCAGTAGAAGATTCGGCTGCGGCCCCTGCTACTGGGTGGAACTCCAGGTTGAGGTTACTCGGAATGCTAATATATCAGTATCCTTGGTTTGGTACTCAGGAGTATAATAAGTCAATTTGTACTCTTTAACACCAGCTTTGAATCAACACTTTAGTCTCTG ++ +%84695532.,-.:+&''+'&(%'('&%$&&..]2233-,-58>DB]479;>9899)))..-)((-0'(7879=5;8?A>:7@67=<663/.-3-3'.5=;2++;9:;99&&()'%*%%+++,,/>@9?@HABA334+)*&)+-.0/25:750**)(()23581100/0-/44=8]2@3333101334214778743.+'(%%.45''':6:.,344095-+,*)())()(&&)--,.**,+.&%/7@>;2012441+*+))68123?=660-=&56')&522688<]1+ \ No newline at end of file diff --git a/test_utils.py b/test_utils.py new file mode 100644 index 0000000..cc5d326 --- /dev/null +++ b/test_utils.py @@ -0,0 +1,47 @@ +import process_fastq +from Bio import SeqIO, Align + +#In order to run the tests, enter in terminal : pytest test_utils.py +#Warning : the 3 last tests have an extremely small chance to fail due to functions based on randomness, running the tests several times can confirm if it is the reason or not + +def test_read_fastq(): + filePath = "test_material/rbcL_Qiagen_tomato_5000_test_file.fastq" + reads = process_fastq.read_fastq(filePath) + assert len(reads) == 2 + assert str(reads[0].seq) == "GTATGCTTCGTTCATTCAAATTTGGGTGTTTAGCAATTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTGGATCGGCAGGTGTTTAACCGTTTTCGCATTTATCGTGGCTTAACACCAGCGTTTTTCATTGTAACTTAAAATTATATAAGAGAAGAAGAATCTTTGATTTTTTTTTTTGAAAAAGGTAACCGAGCTTCTTTAGTAATAAGACTATTCAAATTACAATATTCGTGGAAAATCGTAATAAATATTGAAGGCATCTTTTAATAGCGAAGTTTGAACAAAATTTCCAA" + assert str(reads[1].seq) == "GTTGTACTTCGTTCCAGTTATCAGATGTTGGGTGTTTAGCCGTTTTCGCATTTATCATTGAAACAACCGCGTTTTCGTGCGCCGCTTCACCTACAATGGAAGTAAACATATTGGTAACAGAACCTTTATGTAAAGGTCTAAAGGAGTGGCTACATAAGCAATATATTGATCTTTTCTCCAACAACGCGCTCGATGCGGTAGCATCGCACCTTTGTAACGATCAAGACTGGTAGAGTCCATCGGTCCATACAATTGTCCATGTACCAGTAGAAGATTCGGCTGCGGCCCCTGCTACTGGGTGGAACTCCAGGTTGAGGTTACTCGGAATGCTAATATATCAGTATCCTTGGTTTGGTACTCAGGAGTATAATAAGTCAATTTGTACTCTTTAACACCAGCTTTGAATCAACACTTTAGTCTCTG" + +def test_split_fastq(): + inputPath="test_material/rbcL_Qiagen_tomato_5000_test_file.fastq" + outputDir="test_material" + output_name="outputTest" + assert process_fastq.split_fastq(inputPath,outputDir,output_name) == ('test_material\\outputTest_top20.fastq', 'test_material\\outputTest_remaining80.fastq') + +def test_quality_scores(): + filePath = "test_material/rbcL_Qiagen_tomato_5000_test_file.fastq" + reads = process_fastq.getContentFile(filePath) + assert reads==['@632f5334-2c26-4f91-a611-bbcc85d83ea1 runid=84dc383f16e4874203b6120bf223ea75d3815a54 read=14 ch=417 start_time=2021-11-26T13:56:48Z flow_cell_id=FAR79813 protocol_group_id=rbcL_Qiagen_tomato sample_id=no_sample\n', 'GTATGCTTCGTTCATTCAAATTTGGGTGTTTAGCAATTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTGGATCGGCAGGTGTTTAACCGTTTTCGCATTTATCGTGGCTTAACACCAGCGTTTTTCATTGTAACTTAAAATTATATAAGAGAAGAAGAATCTTTGATTTTTTTTTTTGAAAAAGGTAACCGAGCTTCTTTAGTAATAAGACTATTCAAATTACAATATTCGTGGAAAATCGTAATAAATATTGAAGGCATCTTTTAATAGCGAAGTTTGAACAAAATTTCCAA\n', '+\n', "$$%&$$%00*268%%)($$#####%++]<;7-$$#$#$257166>>8;7820.(''''')668.0+,-0.,(()''(,,++55]<<*')10024=//.*,*(),/)+//00347%&'%#**&%()'&'(((()'***+*33373356*,+')),0,)&\n", '@c4124d21-64a3-4b8e-ac5a-771ea99ff56f runid=84dc383f16e4874203b6120bf223ea75d3815a54 read=704 ch=203 start_time=2021-11-26T14:00:32Z flow_cell_id=FAR79813 protocol_group_id=rbcL_Qiagen_tomato sample_id=no_sample\n', 'GTTGTACTTCGTTCCAGTTATCAGATGTTGGGTGTTTAGCCGTTTTCGCATTTATCATTGAAACAACCGCGTTTTCGTGCGCCGCTTCACCTACAATGGAAGTAAACATATTGGTAACAGAACCTTTATGTAAAGGTCTAAAGGAGTGGCTACATAAGCAATATATTGATCTTTTCTCCAACAACGCGCTCGATGCGGTAGCATCGCACCTTTGTAACGATCAAGACTGGTAGAGTCCATCGGTCCATACAATTGTCCATGTACCAGTAGAAGATTCGGCTGCGGCCCCTGCTACTGGGTGGAACTCCAGGTTGAGGTTACTCGGAATGCTAATATATCAGTATCCTTGGTTTGGTACTCAGGAGTATAATAAGTCAATTTGTACTCTTTAACACCAGCTTTGAATCAACACTTTAGTCTCTG\n', '+\n', "%84695532.,-.:+&''+'&(%'('&%$&&..]2233-,-58>DB]479;>9899)))..-)((-0'(7879=5;8?A>:7@67=<663/.-3-3'.5=;2++;9:;99&&()'%*%%+++,,/>@9?@HABA334+)*&)+-.0/25:750**)(()23581100/0-/44=8]2@3333101334214778743.+'(%%.45''':6:.,344095-+,*)())()(&&)--,.**,+.&%/7@>;2012441+*+))68123?=660-=&56')&522688<]1+"] + quality = process_fastq.extractReadQuality(reads) + assert quality==["$$%&$$%00*268%%)($$#####%++]<;7-$$#$#$257166>>8;7820.(''''')668.0+,-0.,(()''(,,++55]<<*')10024=//.*,*(),/)+//00347%&'%#**&%()'&'(((()'***+*33373356*,+')),0,)&", "%84695532.,-.:+&''+'&(%'('&%$&&..]2233-,-58>DB]479;>9899)))..-)((-0'(7879=5;8?A>:7@67=<663/.-3-3'.5=;2++;9:;99&&()'%*%%+++,,/>@9?@HABA334+)*&)+-.0/25:750**)(()23581100/0-/44=8]2@3333101334214778743.+'(%%.45''':6:.,344095-+,*)())()(&&)--,.**,+.&%/7@>;2012441+*+))68123?=660-=&56')&522688<]1"] + converted_quality = process_fastq.convertQuality(quality) + assert converted_quality==[3, 3, 4, 5, 3, 3, 4, 15, 15, 9, 17, 21, 23, 4, 4, 8, 7, 3, 3, 2, 2, 2, 2, 2, 4, 10, 10, 60, 27, 26, 22, 12, 3, 3, 2, 3, 2, 3, 17, 20, 22, 16, 21, 21, 29, 29, 23, 26, 22, 23, 17, 15, 13, 7, 6, 6, 6, 6, 6, 8, 21, 21, 23, 13, 15, 10, 11, 12, 15, 13, 11, 7, 7, 8, 6, 6, 7, 11, 11, 10, 10, 20, 20, 60, 27, 27, 9, 6, 8, 16, 15, 15, 17, 19, 27, 14, 23, 5, 13, 19, 22, 24, 26, 15, 15, 15, 17, 18, 8, 7, 7, 8, 8, 5, 4, 4, 3, 4, 2, 3, 2, 4, 6, 15, 17, 52, 37, 37, 28, 8, 7, 7, 5, 3, 2, 7, 11, 11, 9, 4, 5, 17, 15, 20, 22, 22, 34, 60, 60, 14, 12, 13, 60, 60, 12, 24, 22, 15, 11, 10, 15, 20, 9, 10, 7, 13, 13, 16, 15, 19, 24, 26, 25, 24, 23, 20, 18, 19, 20, 18, 10, 9, 14, 13, 13, 12, 13, 9, 5, 9, 4, 4, 5, 4, 6, 14, 9, 22, 30, 27, 24, 28, 26, 5, 5, 5, 9, 14, 16, 15, 12, 13, 16, 14, 16, 25, 30, 32, 25, 24, 25, 22, 16, 15, 15, 23, 25, 25, 33, 60, 60, 29, 28, 14, 14, 13, 9, 11, 9, 7, 8, 11, 14, 8, 10, 14, 14, 15, 15, 18, 19, 22, 4, 5, 6, 4, 2, 9, 9, 5, 4, 7, 8, 6, 5, 6, 7, 7, 7, 7, 8, 6, 9, 9, 9, 10, 9, 18, 18, 18, 22, 18, 18, 20, 21, 9, 11, 10, 6, 8, 8, 11, 15, 11, 8, 5, 4, 23, 19, 21, 24, 20, 20, 18, 17, 13, 11, 12, 13, 25, 10, 5, 6, 6, 10, 6, 5, 7, 4, 6, 7, 6, 5, 4, 3, 5, 5, 13, 13, 60, 17, 17, 18, 18, 12, 11, 12, 20, 23, 29, 35, 33, 60, 19, 22, 24, 26, 29, 24, 23, 24, 24, 8, 8, 8, 13, 13, 12, 8, 7, 7, 12, 15, 6, 7, 22, 23, 22, 24, 27, 30, 27, 11, 10, 6, 8, 60, 21, 22, 22, 21, 22, 28, 25, 14, 13, 14, 14, 12, 13, 14, 17, 17, 19, 18, 21, 18, 17, 17, 17, 16, 14, 6, 5, 6, 11, 13, 11, 5, 5, 7, 7, 4, 5, 6, 7, 8, 13, 11, 10, 6, 8, 6, 3, 6, 3, 2, 2, 11, 15, 19, 19, 18, 19, 12, 21, 17, 15, 21, 15, 12, 7, 6, 5, 3, 3, 7, 8, 8, 10, 22, 25, 22, 24, 21, 22, 27, 43, 60, 60, 7, 9, 7, 29, 28, 20, 26, 23, 30, 32, 29, 25, 22, 31, 21, 22, 28, 27, 21, 21, 18, 14, 13, 12, 18, 12, 18, 6, 13, 20, 28, 26, 17, 10, 10, 26, 24, 25, 26, 24, 24, 5, 5, 7, 8, 6, 4, 9, 4, 4, 10, 10, 10, 11, 11, 14, 29, 31, 24, 30, 31, 39, 32, 33, 32, 18, 18, 19, 10, 8, 9, 5, 8, 10, 12, 13, 15, 14, 17, 20, 25, 22, 20, 15, 9, 9, 8, 7, 7, 8, 17, 18, 20, 23, 16, 16, 15, 15, 14, 15, 12, 14, 19, 19, 28, 23, 60, 17, 31, 18, 18, 18, 18, 16, 15, 16, 18, 18, 19, 17, 16, 19, 22, 22, 23, 22, 19, 18, 13, 10, 6, 7, 4, 4, 13, 19, 20, 27, 35, 27, 23, 25, 20, 14, 9, 21, 26, 24, 24, 19, 17, 9, 21, 15, 21, 17, 15, 21, 24, 7, 20, 23, 22, 21, 17, 17, 17, 21, 12, 17, 14, 60, 60, 28, 29, 6, 6, 6, 25, 21, 25, 13, 11, 18, 19, 19, 15, 24, 20, 12, 10, 11, 9, 8, 7, 8, 8, 7, 8, 7, 5, 5, 8, 12, 12, 11, 13, 9, 9, 11, 10, 13, 5, 4, 14, 22, 31, 29, 26, 17, 15, 16, 17, 19, 19, 16, 10, 9, 10, 8, 8, 21, 23, 16, 17, 18, 30, 28, 21, 21, 15, 12, 28, 5, 20, 21, 6, 8, 5, 20, 17, 17, 21, 23, 23, 27, 60, 16] + +def test_create_random_sequence(): + seed=1 + random_seq = process_fastq.create_random_sequence(50,1) + assert random_seq == "TAGACCCCTACACCACGTAGAAAACTCATCCTGTTCGACATGAGCTGGCC" + +def test_mutation(): + sequence = "ATCG" + mutated_sequence = process_fastq.damage_sequence(sequence, mutation_rate=1.0) + assert mutated_sequence != sequence + +def test_deletion(): + sequence = "ATCG" + deleted_sequence = process_fastq.damage_sequence(sequence, deletion_rate=1.0) + assert deleted_sequence != sequence + +def test_insertion(): + sequence = "ATCG" + inserted_sequence = process_fastq.damage_sequence(sequence, insertion_rate=0.98) + assert inserted_sequence != sequence \ No newline at end of file From a6a1eb926f0a4b0a9e6d88154e42267dd73ca231 Mon Sep 17 00:00:00 2001 From: ykeren6 Date: Wed, 10 Apr 2024 18:20:28 +0200 Subject: [PATCH 2/9] unitests coded and docstrings formated --- test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_utils.py b/test_utils.py index cc5d326..8b85ef7 100644 --- a/test_utils.py +++ b/test_utils.py @@ -28,7 +28,7 @@ def test_quality_scores(): def test_create_random_sequence(): seed=1 - random_seq = process_fastq.create_random_sequence(50,1) + random_seq = process_fastq.create_random_sequence(50,seed) assert random_seq == "TAGACCCCTACACCACGTAGAAAACTCATCCTGTTCGACATGAGCTGGCC" def test_mutation(): From cdb0a990776adc919e20f004ea2f61e847f12786 Mon Sep 17 00:00:00 2001 From: ykeren6 Date: Wed, 10 Apr 2024 18:32:42 +0200 Subject: [PATCH 3/9] docstrings formated for access_genbank.py --- access_genbank.py | 62 +++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/access_genbank.py b/access_genbank.py index 32ef2a8..88b1c6c 100644 --- a/access_genbank.py +++ b/access_genbank.py @@ -9,19 +9,18 @@ #------------------------------- def build_search_term(gene_name, min_len=0, max_len=-1): - """ generate a search query for NCBI's nucleotide database (GenBank). Bio.Entrez does not provide filters such as sequence length and gene name + """ + Generate a search query for NCBI's nucleotide database (GenBank). Bio.Entrez does not provide filters such as sequence length and gene name so a search query keywords must be used instead. Used to make database queries in extract_database and download_database - Parameters - ---------- - gene_name: (str) name of the gene of interest - min_len: (int, optional) lower bound of the sequence length filter, by default 0 - max_len: (int, optional) upper bound of the sequence length filter, by default -1 means no limit + Parameters: + gene_name(str): name of the gene of interest + min_len(int, optional): lower bound of the sequence length filter, by default 0 + max_len(int, optional): upper bound of the sequence length filter, by default -1 means no limit - Returns - ---------- - term: (str) search query + Returns: + term(str): search query """ term = f"{gene_name}[Gene Name]" if max_len == -1: @@ -32,17 +31,16 @@ def build_search_term(gene_name, min_len=0, max_len=-1): def extract_database(gene_name, seqlen_start=0, seqlen_stop=-1): - '''Request data on GenBank from NCBI (with Entrez module), with filtering options + ''' + Request data on GenBank from NCBI (with Entrez module), with filtering options - Parameters - ---------- - gene_name: (str) name of the gene to extract from GenBank - seqlen_start: (int, optional) lower bound of the sequence length filter, by default 0 - seqlen_stop: (int, optional) upper bound of the sequence length filter, by default -1 means no limit + Parameters: + gene_name(str): name of the gene to extract from GenBank + seqlen_start(int, optional): lower bound of the sequence length filter, by default 0 + seqlen_stop(int, optional): upper bound of the sequence length filter, by default -1 means no limit - Returns - ---------- - seq_record: (str) raw data of all sequences + Returns: + seq_record(str): raw data of all sequences ''' handle = Entrez.esearch(db= "nucleotide", term= build_search_term(gene_name, seqlen_start, seqlen_stop), retmax = 10000) record = Entrez.read(handle) @@ -60,15 +58,16 @@ def extract_database(gene_name, seqlen_start=0, seqlen_stop=-1): def download_database(gene_name, seqlen_start=0, seqlen_stop=-1): - '''Download data to a .fasta file, with filtering options + ''' + Download data to a .fasta file, with filtering options Args: - gene_name: (str) name of the gene to extract from GenBank - seqlen_start: (int) lower bound of the sequence length filter - seqlen_stop: (int) upper bound of the sequence length filter + gene_name(str): name of the gene to extract from GenBank + seqlen_start(int): lower bound of the sequence length filter + seqlen_stop(int): upper bound of the sequence length filter Returns: - data_path: (str) path to downloaded data .fasta file + data_path(str): path to downloaded data .fasta file ''' data_path = f"{gene_name}[{seqlen_start},{seqlen_stop}].fasta" with open(data_path, mode="w") as file: @@ -77,17 +76,16 @@ def download_database(gene_name, seqlen_start=0, seqlen_stop=-1): def download_sequence(species, gene_name, dst, start_length=None, stop_length= None, id = None, permissive_search = True): """ - download sequence from GenBank through the Entrez database. + Download sequence from GenBank through the Entrez database. Parameters: - ---------- - species(str): name of species - gene_name(str): name of gene - dst(str,Path-like): destination file path - start_length(int): minimum length of sequence - stop_length(int): maximum length of sequence - id(list): list of NCBi ids of sequences to download. If provided, overrides gene_name and species. - permissive_search(bool, default = True): when True, if Advanced NCBI query returns nothing, replace it with a less precise general query. + species(str): name of species + gene_name(str): name of gene + dst(str,Path-like): destination file path + start_length(int): minimum length of sequence + stop_length(int): maximum length of sequence + id(list): list of NCBi ids of sequences to download. If provided, overrides gene_name and species. + permissive_search(bool, default = True): when True, if Advanced NCBI query returns nothing, replace it with a less precise general query. """ if id == None: From d9e1dd82e15acc6c693864b6ed03e6ea97a0a01d Mon Sep 17 00:00:00 2001 From: Ghassan Abboud Date: Tue, 2 Apr 2024 14:12:39 +0300 Subject: [PATCH 4/9] add quality_scores functions --- process_fastq.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/process_fastq.py b/process_fastq.py index 7b794a8..7f45a26 100644 --- a/process_fastq.py +++ b/process_fastq.py @@ -2,11 +2,11 @@ import numpy as np from Bio import SeqIO, Align import os.path as ospath + import os import gzip import shutil -import sys -import matplotlib.pyplot as plt +import sys import matplotlib.pyplot as plt #---------------------------- @@ -33,11 +33,8 @@ def extract_gz(src_file, dst_file): def read_fastq(fastq_filepath=None): """ Read a fastq file and return the reads. - - Args: - fastq_filepath (str): filepath of the .fastq file - Returns: - reads(list of Bio.SeqRecord.SeqRecord): reads from the fastq file + :param fastq_filepath: filepath of the .fastq file [str] + :return: reads from the fastq file [list of Bio.SeqRecord.SeqRecord] """ if fastq_filepath is None: fastq_filepath = "data/rbcL_Qiagen_tomato.fastq" # default path (example) if fastq_filepath.lower().endswith('.gz'): From 67163147926a56618a0e482845ea5b4cb7e76908 Mon Sep 17 00:00:00 2001 From: ghassanabboud Date: Sun, 7 Apr 2024 15:55:40 +0200 Subject: [PATCH 5/9] fix process_fastq import --- process_fastq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/process_fastq.py b/process_fastq.py index 7f45a26..48209c0 100644 --- a/process_fastq.py +++ b/process_fastq.py @@ -6,7 +6,8 @@ import os import gzip import shutil -import sys import matplotlib.pyplot as plt +import sys +import matplotlib.pyplot as plt #---------------------------- From c78b2aa39b21ca92861d06109cafefdd0e07389d Mon Sep 17 00:00:00 2001 From: ghassanabboud Date: Sun, 7 Apr 2024 16:10:06 +0200 Subject: [PATCH 6/9] update xstract_fastq --- process_fastq.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/process_fastq.py b/process_fastq.py index 48209c0..ec680cb 100644 --- a/process_fastq.py +++ b/process_fastq.py @@ -110,7 +110,7 @@ def get_file_iterator(filename): break -def extract_fastq(main_dir, sample_nb, dst): +def extract_fastq(main_dir, barcode_nb, dst): """ Extract all fastq files under a certain barcode/sample number from an expedition results folder. @@ -121,16 +121,23 @@ def extract_fastq(main_dir, sample_nb, dst): Returns: None """ + search_dirs=[] for root, dirs, files in os.walk(main_dir): if "fastq_pass" in dirs: - pass_dir = ospath.join(root, "fastq_pass") - break - for root, dirs, files in os.walk(pass_dir): - if f"barcode{sample_nb}" in dirs: - fastq_dir = ospath.join(root, f"barcode{sample_nb}") - elif f"barcode0{sample_nb}" in dirs: - fastq_dir = ospath.join(root, f"barcode0{sample_nb}") - concatenate_fastq(fastq_dir, dst) + search_dirs.append(ospath.join(root, "fastq_pass")) + if "barcoding" in dirs: + search_dirs.append(ospath.join(root, "barcoding")) + + for search_dir in search_dirs: + for root, dirs, files in os.walk(search_dir): + if f"barcode{barcode_nb}" in dirs: + fastq_dir = ospath.join(root, f"barcode{barcode_nb}") + print(fastq_dir) + concatenate_fastq(fastq_dir, dst) + elif f"barcode0{barcode_nb}" in dirs: + fastq_dir = ospath.join(root, f"barcode0{barcode_nb}") + print(fastq_dir) + concatenate_fastq(fastq_dir, dst) #---------------------------------- #GET AND VISUALIZE QUALITY SCORES From 8933b9a2bcbfee85261debd6a830b778c4447f7e Mon Sep 17 00:00:00 2001 From: ykeren6 Date: Wed, 3 Apr 2024 14:42:34 +0200 Subject: [PATCH 7/9] added uniformed docstrings and pytests --- process_fastq.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/process_fastq.py b/process_fastq.py index ec680cb..c7fc7d6 100644 --- a/process_fastq.py +++ b/process_fastq.py @@ -34,8 +34,11 @@ def extract_gz(src_file, dst_file): def read_fastq(fastq_filepath=None): """ Read a fastq file and return the reads. - :param fastq_filepath: filepath of the .fastq file [str] - :return: reads from the fastq file [list of Bio.SeqRecord.SeqRecord] + + Args: + fastq_filepath (str): filepath of the .fastq file + Returns: + reads(list of Bio.SeqRecord.SeqRecord): reads from the fastq file """ if fastq_filepath is None: fastq_filepath = "data/rbcL_Qiagen_tomato.fastq" # default path (example) if fastq_filepath.lower().endswith('.gz'): @@ -110,7 +113,7 @@ def get_file_iterator(filename): break -def extract_fastq(main_dir, barcode_nb, dst): +def extract_fastq(main_dir, sample_nb, dst): """ Extract all fastq files under a certain barcode/sample number from an expedition results folder. @@ -121,23 +124,16 @@ def extract_fastq(main_dir, barcode_nb, dst): Returns: None """ - search_dirs=[] for root, dirs, files in os.walk(main_dir): if "fastq_pass" in dirs: - search_dirs.append(ospath.join(root, "fastq_pass")) - if "barcoding" in dirs: - search_dirs.append(ospath.join(root, "barcoding")) - - for search_dir in search_dirs: - for root, dirs, files in os.walk(search_dir): - if f"barcode{barcode_nb}" in dirs: - fastq_dir = ospath.join(root, f"barcode{barcode_nb}") - print(fastq_dir) - concatenate_fastq(fastq_dir, dst) - elif f"barcode0{barcode_nb}" in dirs: - fastq_dir = ospath.join(root, f"barcode0{barcode_nb}") - print(fastq_dir) - concatenate_fastq(fastq_dir, dst) + pass_dir = ospath.join(root, "fastq_pass") + break + for root, dirs, files in os.walk(pass_dir): + if f"barcode{sample_nb}" in dirs: + fastq_dir = ospath.join(root, f"barcode{sample_nb}") + elif f"barcode0{sample_nb}" in dirs: + fastq_dir = ospath.join(root, f"barcode0{sample_nb}") + concatenate_fastq(fastq_dir, dst) #---------------------------------- #GET AND VISUALIZE QUALITY SCORES From a23092f7b4d5f6e3ffd70703f9686a42acbabb9f Mon Sep 17 00:00:00 2001 From: ykeren6 Date: Fri, 10 May 2024 11:49:58 +0200 Subject: [PATCH 8/9] added fake fastq generation to repo --- create_fake_dataset.py | 34 +++++++ generate_fastq.py | 214 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 248 insertions(+) create mode 100644 create_fake_dataset.py create mode 100644 generate_fastq.py diff --git a/create_fake_dataset.py b/create_fake_dataset.py new file mode 100644 index 0000000..a343afe --- /dev/null +++ b/create_fake_dataset.py @@ -0,0 +1,34 @@ +from access_genbank import download_sequence_from_id +from generate_fastq import generate_fastq, multiplex +import os +import os.path as ospath +import argparse +from Bio import SeqIO + +def main(): + + destination_folder = "test_fake4" + ids= ["ON409953.1", "AF288129.1", "AF288126.1", "AF288125.1" ] + multiplexed = True + + + + destination_folder_path= ospath.join("data", destination_folder) + if not ospath.exists(destination_folder_path): + os.makedirs(destination_folder_path) + + record_parser = download_sequence_from_id(ids) + + for id, record in zip(ids,record_parser): + new_demultiplexed_fastq_path = ospath.join(destination_folder_path, str(id)+".fastq") + generate_fastq(record.seq, 100, new_demultiplexed_fastq_path) + print("left for loop") + if multiplexed: + multiplex_path=ospath.join(destination_folder_path,"multiplexed.fastq") + multiplex([destination_folder_path],multiplex_path) + + for record in SeqIO.parse(multiplex_path,"fastq"): + print(record) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/generate_fastq.py b/generate_fastq.py new file mode 100644 index 0000000..272665e --- /dev/null +++ b/generate_fastq.py @@ -0,0 +1,214 @@ +from Bio import SeqIO +import random +from Bio.Seq import Seq +import numpy as np +import os +import os.path as ospath +import numpy as np +import scipy.stats +import matplotlib.pyplot as plt + + +def generate_weighted_sequence_variant(sequence, weights=[0.25, 0.25, 0.25, 0.25]): + """ + Flips an initial sequence as it could happen when using the MinIon tool + + Arguments: + sequence (Seq): sequence of interest + weights(list of 4 float): probability of each sequence version (initial sequence, reverse complement, complement, reverse initial) + + Returns: + selected_variant(Seq): randomly reversed sequence + variant_description(str): string indicating which variant is selected + """ + variants = ["initial_sequence", "reverse_complement", "complement", "reverse_initial"] + variant_sequences = [sequence, sequence.reverse_complement(), sequence.complement(), sequence[::-1]] + + selected_variant_index = random.choices(range(len(variants)), weights=weights, k=1)[0] + selected_variant = variant_sequences[selected_variant_index] + variant_description = variants[selected_variant_index] + + return selected_variant, variant_description + + + +def break_prob_function(position, sequence_length): + """ + Example of probability function usable for the previous function + + Arguments: + position(int): position in sequence + sequence_length(int) + Returns: + probability at a specific position in sequence(float) + """ + max_prob = 0.5 # Max probability at start and end of sequence + return max_prob * (position / sequence_length) + +def bimodal_distribution(position,sequence_length): + """ + Probability density function of a bimodal normal distribution, highest chance is centered around positions at one or two thirds of sequence length + + Parameters + ---------- + position: position in the sequence for which probability of event is calculated + sequence_length: length of sequence + + Returns + --------- + probability of event, between 0 and 1 + """ + peaks = [sequence_length/3, 2*sequence_length/3] + return 0.5* scipy.stats.norm(peaks[0],sequence_length/15).pdf(position) + 0.5*scipy.stats.norm(peaks[1],sequence_length/15).pdf(position) + + +def break_sequence_with_probability(sequence, break_prob_function = bimodal_distribution, nbreaks = 2, take_longest= False): + """ + Simulates breakages in the sequence, which could happen using the MinIon tool + + Arguments: + sequence(Seq): sequence of interest + break_prob_funtion(function): probability function used as weight for the breakages + + Returns: + final_sequence (Seq): final sequence after breakages + break_info (dict): dictionary containing information about the breakages + """ + new_sequence=sequence + break_info = {'number_of_breaks': 0, 'part_taken': ''} + + for i, base in enumerate(sequence): + # Computes the breakage probability + break_prob = break_prob_function(i, len(sequence)) + + # Checks if sequence breaks at this iteration + if random.random() <= break_prob: + if nbreaks > 0: + new_sequence = new_sequence[:i+break_info["number_of_breaks"]] +"N" +new_sequence[i+break_info["number_of_breaks"]:] + nbreaks -= 1 # Breakage marker is 'N' + break_info['number_of_breaks'] += 1 + broken_parts = new_sequence.split("N") + if take_longest: + length_index = lambda index: len(broken_parts[index]) + final_seq_index = max(range(len(broken_parts)), key=length_index) + else: + final_seq_index = random.choice(range(len(broken_parts))) + + final_seq=broken_parts[final_seq_index] + break_info["part_taken"] = final_seq_index + + return final_seq, break_info + + +def mutate(base): + """ + Simulates substitution mutation + + Arguments: + base(char) + + Returns: + muted base (char) + """ + # Bases list + bases = ['A', 'T', 'C', 'G'] + if base not in bases: + raise ValueError("Invalid base provided. Base must be one of 'A', 'T', 'C', or 'G'.") + bases.remove(base) + # Randomly selected new base + return random.choice(bases) + + +def assign_quality_scores(sequence, mutation_probability=0.1,mutation_mean = 16,mutation_sd = 3,basic_mean = 48,basic_sd = 3): + """ + Assigns fake scores for each base, following a different normal law if base is mutated or not + + Arguments: + sequence(Seq): sequence of interest + mutation_probability(float) + mutation_mean(int): mean of normal law associated with mutation + mutation_sd(int): standard deviation of normal law associated with mutation + basic_mean(int): mean of normal law associated with no-change bases + mutation_mean(int): standard deviation of normal law associated with no-change bases + + Returns: + quality score of sequence (in base 64 characters -> ASCII) + nb_mutations(int): number of mutations + mutations_positions(array of ints): positions of mutations + """ + quality_scores = [] + base64_table = """!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~""" + nb_mutations=0 + mutations_positions= [] + for i, base in enumerate(sequence): + # Computes score for each base depending on mutation + if random.random() < mutation_probability: + # If mutation needed + base = mutate(base) + nb_mutations+=1 + mutations_positions.append(i+1) + quality_score = np.random.normal(mutation_mean, mutation_sd) + else: + # If no mutation needed + quality_score = np.random.normal(basic_mean, basic_sd) + # Limits the score between 0 and 93 (ASCII go from 33 to 126) + quality_score = max(min(quality_score, 93), 0) + base64_score = base64_table[int(round(quality_score))] + quality_scores.append(base64_score) + return ''.join(quality_scores), nb_mutations, mutations_positions + +def generate_fastq(input_seq, n_reads, dst, variants_weights= [0.25, 0.25,0.25,0.25], break_prob_function=break_prob_function, mutation_prob =0.1, quality_score_params= [10,2,50,5], nbreaks=2, take_longest=False): + + with open(dst, "a") as handle: # Operations on newly created or updated file + for i in range(n_reads) : + seq, orientation = generate_weighted_sequence_variant(input_seq,variants_weights) + broken_seq, break_info = break_sequence_with_probability(seq,break_prob_function, nbreaks,take_longest) + quality_string, n_mutations,mutations_positions = assign_quality_scores(broken_seq,mutation_prob,quality_score_params[0],quality_score_params[1],quality_score_params[2],quality_score_params[3]) + + variant_str = "Orientation_of_sequence="+orientation+" " + breakage_str = "Number_of_breakages="+str(break_info['number_of_breaks'])+" Part_taken="+str(break_info['part_taken'])+" " + mutations_str = "Number_of_mutations="+str(n_mutations)+" Positions="+str(mutations_positions) + + handle.write(f"@read{i} "+variant_str+breakage_str+mutations_str+'\n') #writes the sequence id + handle.write(str(broken_seq)+ '\n') #writes the sequence generated by transforming it into string + handle.write("+\n") #writes the + separator + handle.write(quality_string + '\n') #writes the quality score + +def multiplex(folder_paths: list, dst: str): + """ + merge multiple fastq files into one, while writing in the description the origin of each read + + Parameters + ---------- + folder_paths: list of folder paths to look for fastq files for merging + dst: path of the output merged fastq file + + """ + parsers=dict() + for folder_path in folder_paths: + for root,_,files in os.walk(folder_path): + for file in files: + if file.endswith(".fastq"): + file_path = ospath.join(root,file) + parsers[file_path] = list(SeqIO.parse(file_path, "fastq")) + + with open(dst, "a") as writer: + print("inside writing") + while parsers: + parent_file_path = random.choice(list(parsers.keys())) + current_list=parsers[parent_file_path] + index= random.randint(0, len(current_list)-1) + seqrecord = current_list[index] + seqrecord.description = seqrecord.description + " source=" + parent_file_path + SeqIO.write(seqrecord, writer, "fastq") + current_list.pop(index) + print("index popped") + if not current_list: + print("list popped") + parsers.pop(parent_file_path) + + + + + + From c2b45795173da99cbeae694ecf656f01db6174ce Mon Sep 17 00:00:00 2001 From: ykeren6 Date: Fri, 10 May 2024 11:57:03 +0200 Subject: [PATCH 9/9] added fake fastq generation to repo --- access_genbank.py | 83 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 24 deletions(-) diff --git a/access_genbank.py b/access_genbank.py index 88b1c6c..d045394 100644 --- a/access_genbank.py +++ b/access_genbank.py @@ -74,43 +74,78 @@ def download_database(gene_name, seqlen_start=0, seqlen_stop=-1): file.write(extract_database(gene_name, seqlen_start, seqlen_stop)) return data_path -def download_sequence(species, gene_name, dst, start_length=None, stop_length= None, id = None, permissive_search = True): +def download_sequence_from_id(id: list, dst = None): + """ + download sequence from GenBank through the Entrez database using unique identifiers. + + Args: + id (list): list of IDs of sequences to download from GenBank + dst(str): destination file path + + Returns: + Bio.SeqIO.FastaIO.FastaIterator: iterator over the records corresponding to the input ids + + """ + handle = Entrez.efetch(db="nucleotide", id=id, retmode = "fasta", rettype = "fasta") + record_parser = SeqIO.parse(handle, "fasta") + if dst != None: + with open(dst, mode="a") as writer: + SeqIO.write(record_parser,writer,"fasta") + return SeqIO.parse(dst,"fasta") + return record_parser + +def get_species_from_ids(ids:list): + """ + Return the species from GenBank entries with certain ids. + + Parameters: + ids: list of ids to search + + Returns: + dictionary that matches ids to organism names + + """ + species_dict= dict() + for id in ids: + handle = Entrez.efetch(db="nucleotide", id=id, retmode = "gb", rettype = "gb") + records = SeqIO.parse(handle,"genbank") + one_record = next(records) + species_dict[id]=one_record.annotations["organism"] + return species_dict + +def download_sequence(species, gene_name, dst = None, start_length=None, stop_length= None, permissive_search = True): """ Download sequence from GenBank through the Entrez database. Parameters: species(str): name of species gene_name(str): name of gene - dst(str,Path-like): destination file path + dst(str,Path-like): destination file path, if None does not write to a file, only returns start_length(int): minimum length of sequence stop_length(int): maximum length of sequence id(list): list of NCBi ids of sequences to download. If provided, overrides gene_name and species. permissive_search(bool, default = True): when True, if Advanced NCBI query returns nothing, replace it with a less precise general query. + + Returns: + Bio.SeqRecord.SeqRecord: sequence found from GenBank + """ - if id == None: - search_term = f"{gene_name}[Gene Name] AND {species}[Organism]" - if start_length!= None or stop_length!= None: - search_term += f" {start_length}:{stop_length}[Sequence Length]" + + search_term = f"{gene_name}[Gene Name] AND {species}[Organism]" + if start_length!= None or stop_length!= None: + search_term += f" {start_length}:{stop_length}[Sequence Length]" + handle = Entrez.esearch(db = "nucleotide", term = search_term, retmax = 1, idtype = "acc") + search_result = Entrez.read(handle) + handle.close() + id = search_result["IdList"] + if len(id)==0 and permissive_search: + search_term = f"{gene_name} {species} {start_length}:{stop_length}[Sequence Length]" handle = Entrez.esearch(db = "nucleotide", term = search_term, retmax = 1, idtype = "acc") search_result = Entrez.read(handle) handle.close() id = search_result["IdList"] - n=0 - for i in id: - n+=1 - if n==0 and permissive_search: - search_term = f"{gene_name} {species} {start_length}:{stop_length}[Sequence Length]" - handle = Entrez.esearch(db = "nucleotide", term = search_term, retmax = 1, idtype = "acc") - search_result = Entrez.read(handle) - handle.close() - id = search_result["IdList"] - n=0 - for i in id: - n+=1 - if n==1: - handle = Entrez.efetch(db="nucleotide", id=id, retmode = "fasta", rettype = "fasta") - sequence = handle.read() - handle.close() - with open(dst, mode="a") as writer: - writer.write(sequence) \ No newline at end of file + if len(id)==1: + return next(download_sequence_from_id(id,dst)) + else: + raise ValueError("Your search did not return any results")