From 86817442599b1ac423845fcfb52cfa3fc04ef05c Mon Sep 17 00:00:00 2001 From: Jill Bilodeaux Date: Thu, 6 Feb 2025 12:35:59 -0700 Subject: [PATCH 01/17] seq2structure functions and test data --- seq2struct/seq2struct.py | 57 ++++ seq2struct/seq2struct_run.py | 52 ++++ seq2struct/standardize_afa.py | 59 ++++ seq2struct/test_data/ecoli-t4-trna.afa | 285 ++++++++++++++++++ seq2struct/test_data/ecoli-t4-trna.fasta.gz | Bin 0 -> 3213 bytes seq2struct/test_data/ecoli-t4-trna.sto | 292 +++++++++++++++++++ seq2struct/test_data/test.afa | 285 ++++++++++++++++++ seq2struct/test_data/test.tsv.gz | Bin 0 -> 322162 bytes seq2struct/test_result/test-clean-header.afa | 190 ++++++++++++ 9 files changed, 1220 insertions(+) create mode 100644 seq2struct/seq2struct.py create mode 100644 seq2struct/seq2struct_run.py create mode 100644 seq2struct/standardize_afa.py create mode 100644 seq2struct/test_data/ecoli-t4-trna.afa create mode 100644 seq2struct/test_data/ecoli-t4-trna.fasta.gz create mode 100644 seq2struct/test_data/ecoli-t4-trna.sto create mode 100644 seq2struct/test_data/test.afa create mode 100644 seq2struct/test_data/test.tsv.gz create mode 100644 seq2struct/test_result/test-clean-header.afa diff --git a/seq2struct/seq2struct.py b/seq2struct/seq2struct.py new file mode 100644 index 0000000..09dd04d --- /dev/null +++ b/seq2struct/seq2struct.py @@ -0,0 +1,57 @@ +import csv +import argparse + +class Seq2StructTSV: + def __init__(self, fasta_file, fsa_file, output_file): + self.fasta_file = fasta_file + self.fsa_file = fsa_file + self.output_file = output_file + + def read_fasta(self, file_name): + sequences, headers = {}, [] + with open(file_name, 'r') as file: + seq_id = None + for line in file: + line = line.strip() + if line.startswith('>'): + seq_id = line[1:] + headers.append(seq_id) + sequences[seq_id] = '' + else: + sequences[seq_id] += line.upper() + return sequences, headers + + def create_mapping(self, ref_seq, ann_seq): + mapping, ref_index = {}, 0 + for ann_index, ann_nuc in enumerate(ann_seq): + mapping[ann_index + 1] = ref_index + 1 if ann_nuc != '-' else None + if ann_nuc != '-': + ref_index += 1 + return mapping + + def process(self): + ref_seqs, _ = self.read_fasta(self.fasta_file) + ann_seqs, ann_headers = self.read_fasta(self.fsa_file) + with open(self.output_file, 'w', newline='') as file: + writer = csv.writer(file, delimiter='\t') + writer.writerow(['seq_ref', 'struct_ref', 'struct_nt', 'struct_pos', 'seq_pos']) + for ann_id in ann_headers: + parts = ann_id.split('-') + prefix, amino_acid, anticodon_fsa = parts[0], parts[2], parts[3].replace('T', 'U') + matching_refs = [ref_id for ref_id in ref_seqs if ref_id.startswith(f'{prefix}-tRNA-{amino_acid}-{anticodon_fsa}')] + for ref_id in matching_refs: + mapping = self.create_mapping(ref_seqs[ref_id], ann_seqs[ann_id]) + for ann_index, ann_nuc in enumerate(ann_seqs[ann_id]): + writer.writerow([ref_id, ann_id, ann_nuc, ann_index + 1, mapping.get(ann_index + 1, '')]) + +# Argument Parsing +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate Sequence-to-Structure TSV") + parser.add_argument("fasta_file", type=str, help="Path to reference FASTA file") + parser.add_argument("fsa_file", type=str, help="Path to annotated FSA file") + parser.add_argument("output_file", type=str, help="Path to output TSV file") + args = parser.parse_args() + + processor = Seq2StructTSV(args.fasta_file, args.fsa_file, args.output_file) + processor.process() + print(f"Sequence-to-Structure TSV saved to {args.output_file}") \ No newline at end of file diff --git a/seq2struct/seq2struct_run.py b/seq2struct/seq2struct_run.py new file mode 100644 index 0000000..b1d2ff8 --- /dev/null +++ b/seq2struct/seq2struct_run.py @@ -0,0 +1,52 @@ +import subprocess +import sys +""" +Currently, this is two classes that provide structure to making the seq to structure mapping file. +The function of this driver script: +1. takes an afa file from Modomics with annotated modification sites for various organisms and provides a nucleotide readible in ATGC. +2. Stockholm files of the reference tRNAs are converted to afa files prior to this script. +these reference files are anchored to a consensus structure. This script will take these anchored reference sequences and +provide a structural position to each nucleotide. +3. then both the mod.afa and ref.afa will be used to give each nucleotide in the fasta reference a structural position and modification annotation (if known) + + +Workflow: +1. take a stockholm file reformated to afa and define special characters(?) +2. count non-gap / space positions with corresponding nucleotide +3. save this consensus secondary structure anchored mapping in a new file +4. join the reference with the map to generate a sequence to structure file that can be used for nanopore sequencing conversion of sequence space to structure space. + +File inputs: +ref.afa -> reference file anchored to consensus secondary structure converted from .sto to .afa (esl-reformat) +mod.afa -> modomics afa file that contains modification annotations across multiple species (could just start with ecoli? Could get rid of entirely) +ref.fasta -> fasta reference that contains 5' and 3' sequencing adapters + +Outputs: +standardized_afa: a afa file that has standardized header format, sequencing adapters added to each sequence, and modification annotations +seq2struct_tsv: map of the secondary structure to reference file with modification annotation at relevant nucleotides. + +""" +'---------------------------------------------------------------' + +def main(): + if len(sys.argv) < 2: + print("Usage: python main.py