From 246fa5c0dd7a03b76d1a9aa66e31a7a5188c7e14 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 24 May 2024 17:22:17 +0100
Subject: [PATCH 01/30] Refactoring with Clap enums

---
 Cargo.lock                       |  19 ++
 Cargo.toml                       |   2 +-
 src/cli/mod.rs                   | 184 ++++++++++++++++
 src/exclude_seq.rs               |  44 ----
 src/main.rs                      | 353 ++++---------------------------
 src/map_headers.rs               | 142 -------------
 src/processors.rs                |   7 +
 src/processors/exclude_seq.rs    |  40 ++++
 src/processors/map_headers.rs    | 131 ++++++++++++
 src/processors/remap_head.rs     |  66 ++++++
 src/processors/split_by_count.rs | 100 +++++++++
 src/processors/split_by_size.rs  |  10 +
 src/processors/tpf_fasta.rs      | 272 ++++++++++++++++++++++++
 src/processors/yaml_validator.rs | 265 +++++++++++++++++++++++
 src/remap_head.rs                |  73 -------
 src/split_by_count.rs            | 111 ----------
 src/split_by_size.rs             |  12 --
 src/tpf_fasta.rs                 | 277 ------------------------
 src/yaml_validator.rs            | 272 ------------------------
 19 files changed, 1141 insertions(+), 1239 deletions(-)
 create mode 100644 src/cli/mod.rs
 delete mode 100644 src/exclude_seq.rs
 delete mode 100644 src/map_headers.rs
 create mode 100644 src/processors.rs
 create mode 100644 src/processors/exclude_seq.rs
 create mode 100644 src/processors/map_headers.rs
 create mode 100644 src/processors/remap_head.rs
 create mode 100644 src/processors/split_by_count.rs
 create mode 100644 src/processors/split_by_size.rs
 create mode 100644 src/processors/tpf_fasta.rs
 create mode 100644 src/processors/yaml_validator.rs
 delete mode 100644 src/remap_head.rs
 delete mode 100644 src/split_by_count.rs
 delete mode 100644 src/split_by_size.rs
 delete mode 100644 src/tpf_fasta.rs
 delete mode 100644 src/yaml_validator.rs
diff --git a/Cargo.lock b/Cargo.lock
index 7630133..a09d981 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -199,6 +199,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b1d7b8d5ec32af0fadc644bf1fd509a688c2103b185644bb1e29d164e0703136"
 dependencies = [
  "clap_builder",
+ "clap_derive",
 ]
 
 [[package]]
@@ -213,6 +214,18 @@ dependencies = [
  "strsim",
 ]
 
+[[package]]
+name = "clap_derive"
+version = "4.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0862016ff20d69b84ef8247369fabf5c008a7417002411897d40ee1f4532b873"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.60",
+]
+
 [[package]]
 name = "clap_lex"
 version = "0.5.1"
@@ -626,6 +639,12 @@ dependencies = [
  "allocator-api2",
 ]
 
+[[package]]
+name = "heck"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+
 [[package]]
 name = "hermit-abi"
 version = "0.3.3"
diff --git a/Cargo.toml b/Cargo.toml
index 4c55605..b6f6872 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-clap = { version = "4.4.4", features = ["cargo"] }
+clap = { version = "4.4.4", features = ["cargo", "derive"] }
 colored = "2.0.4"
 compare = "0.1.0"
 csv = "1.3.0"
diff --git a/src/cli/mod.rs b/src/cli/mod.rs
new file mode 100644
index 0000000..74d50ef
--- /dev/null
+++ b/src/cli/mod.rs
@@ -0,0 +1,184 @@
+use clap::{Parser, Subcommand};
+use clap::builder::Str;
+
+const SPLIT_OPTIONS: [&str; 5] = ["pep", "cds", "cdna", "rna", "other"];
+
+// Top-level CLI for Fasta processing; every operation is exposed as a subcommand.
+#[derive(Parser)]
+#[command(version="v1.0.0", about, long_about = None, arg_required_else_help = true)]
+pub struct Cli {
+    // Still an Option (TODO: make it required); with no arguments clap now prints help and exits.
+    // Reference: https://docs.rs/clap/latest/clap/_derive/_tutorial/chapter_2/index.html#defaults
+    #[command(subcommand)]
+    pub command: Option<Commands>
+}
+
+// Reference: https://docs.rs/clap/latest/clap/_derive/_tutorial/chapter_2/index.html
+#[derive(Subcommand)]
+pub enum Commands {
+    YamlValidator {
+        /// Path to the TreeVal yaml file generated by the user
+        #[arg(short, long)]
+        yaml: String,
+
+        /// Print explainers as to why validation fails, if it does fail
+        #[arg(short = 'v', long)]
+        verbose: bool,
+
+        /// Output the log to file
+        #[arg(short = 'o', long, default_value_t = String::from("./"))]
+        output: String
+    },
+
+    SplitByCount {
+
+        /// A path to a valid fasta file.
+        #[arg(short = 'f', long)]
+        fasta_file: String,
+
+        /// The output directory that files will be placed in | outfile will be formatted like {input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa
+        #[arg(short = 'o', long, default_value_t = String::from("./"))]
+        output_directory: String,
+
+        /// The data type of the input data
+        #[arg(short = 'd', value_parser = clap::builder::PossibleValuesParser::new(SPLIT_OPTIONS))]
+        data_type: String,
+
+        /// Do we need to sanitise the headers of the input fasta
+        #[arg(short = 's', value_parser = clap::value_parser!(bool))]
+        sanitise: bool,
+
+        /// How many sequences per file
+        #[arg(short = 'c', value_parser = clap::value_parser!(u16))]
+        count: u16,
+    },
+
+    SplitBySize {
+        /// A path to a valid fasta file.
+        #[arg(short = 'f', long)]
+        fasta_file: String,
+
+        /// Size in MB that a fasta file is to be chunked into
+        #[arg(short = 's', long = "mem-size")]
+        mem_size: u16,
+
+        /// The output directory that files will be placed in
+        #[arg(short = 'o', long, default_value_t = String::from("./"))]
+        output_directory: String,
+    },
+
+    GenesetCSVS {
+        /// The path to the top level directory of your geneset directory.
+        #[arg(short = 'd')]
+        geneset_dir: String,
+
+        /// Specify the clade folder to refresh
+        #[arg(short = 'c', default_value_t = String::from("ALL"))]
+        specifiy_clade: String,
+    },
+
+    MapHeaders {
+        /// A path to a valid fasta file.
+        #[arg(short = 'f', long)]
+        fasta_file: String,
+
+        /// The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta
+        #[arg(short = 'o', long, default_value_t = String::from("./"))]
+        output_directory: String,
+
+        #[arg(short = 'r', default_value_t = String::from("FMMH"))]
+        replace_with: String
+    },
+
+    ReMapHeaders {
+        /// A path to a valid fasta file.
+        #[arg(short = 'f', long)]
+        fasta_file: String,
+
+        /// The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta
+        #[arg(short = 'o', long, default_value_t = String::from("./new"))]
+        output_directory: String,
+
+        /// The original mapped header field, a TSV of old-header, new-header
+        #[arg(short = 'm')]
+        map_file: String
+    },
+
+    #[command(version, about="Profile an input fasta file and return various statistics", long_about = None)]
+    Profile {
+        /// A path to a valid fasta file.
+        #[arg(short = 'f', long)]
+        fasta_file: String,
+
+        /// The output directory for the profile results
+        #[arg(short = 'o', long, default_value_t = String::from("FasMan-out"))]
+        output_dir: String
+    },
+
+    Curate {
+        /// The input fasta file for re-organising
+        #[arg(short = 'f', long)]
+        fasta: String,
+
+        /// The TPF file used to re-organise the input fasta
+        #[arg(short = 't', long)]
+        tpf: String,
+
+        /// Size sort the output or leave as order in AGP
+        #[arg(short = 's')]
+        sort: bool,
+
+        #[arg(short = 'o', default_value_t = String::from("new.fasta"))]
+        output: String,
+
+        /// Length that the N (gap) string should be.
+        #[arg(short, long, default_value_t = 200)]
+        n_length: usize
+    },
+
+    Subset {
+        /// A path to a valid fasta file for profiling.
+        #[arg(short = 'f', long)]
+        fasta_file: String,
+
+        /// Random subset of input file. Default skims the first X given percent
+        #[arg(short = 'r', long)]
+        random: bool,
+
+        /// Percentage of the original file entries that should be retained
+        #[arg(short = 'p', long, default_value_t = 50)]
+        percent: u16
+    },
+
+    FilterFasta {
+        /// A fasta file for processing.
+        #[arg(short = 'f', long)]
+        fasta: String,
+
+        /// The outfile naming
+        #[arg(short = 'o', default_value_t = String::from("FilteredFasta.fa"))]
+        output: String,
+
+        #[arg(short = 'l', long = "filter_list")]
+        filter_list: String
+    },
+
+    Mergehaps {
+
+        /// The input fasta file for re-organising
+        #[arg(short = 'p', long)]
+        fasta_1: String,
+
+        /// The second input fasta file
+        #[arg(short = 's', long)]
+        fasta_2: String,
+
+        /// A '/' separated list with an item per file, these are the namings of the new scaffolds in the merged output
+        #[arg(short = 'n', long, default_value_t = String::from("PRI/HAP"))]
+        naming: String,
+
+        /// Output file prefix
+        #[arg(short = 'o', default_value_t = String::from("merged"))]
+        output: String,
+    }
+}
\ No newline at end of file
diff --git a/src/exclude_seq.rs b/src/exclude_seq.rs
deleted file mode 100644
index ab82c4e..0000000
--- a/src/exclude_seq.rs
+++ /dev/null
@@ -1,44 +0,0 @@
-pub mod exclude_seq_mod {
-    use clap::ArgMatches;
-    use noodles::fasta;
-    use std::error::Error;
-    use std::{fs, io::BufRead, str};
-
-    fn open_fasta<'a>(
-        exclusions: Vec<&str>,
-        fasta: &'a str,
-        out_file: &str,
-    ) -> std::result::Result<&'a str, Box<dyn Error>> {
-        let reader: Result<fasta::Reader<Box<dyn BufRead>>, std::io::Error> =
-            fasta::reader::Builder.build_from_path(fasta);
-        let file = fs::OpenOptions::new()
-            .create(true)
-            .append(true)
-            .open(out_file)?;
-        let mut writer = fasta::Writer::new(file);
-
-        match reader {
-            Ok(fasta) => {
-                let mut binding = fasta;
-                for result in binding.records() {
-                    let record = result?;
-                    if !exclusions.contains(&record.name()) {
-                        writer.write_record(&record)?;
-                    } else {
-                        println!("Found record to exclude: {:?}", &record.name());
-                    }
-                }
-                Ok("Removed Exclusionary List")
-            }
-            Err(_) => Err("Error: Fasta is not valid check file!".into()),
-        }
-    }
-
-    pub fn filter_fasta(arguments: std::option::Option<&ArgMatches>) {
-        let fasta = arguments.unwrap().get_one::<String>("fasta").unwrap();
-        let exclude = arguments.unwrap().get_one::<String>("filter_list").unwrap();
-        let outfile = arguments.unwrap().get_one::<String>("output").unwrap();
-        let list_to_exclude = exclude.split(',').collect::<Vec<&str>>();
-        let _x = open_fasta(list_to_exclude, fasta, outfile);
-    }
-}
diff --git a/src/main.rs b/src/main.rs
index 9e947f9..0f635ac 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,327 +1,66 @@
 #![allow(non_snake_case)]
 
-use clap::{command, Arg, Command};
+use clap::{command, Arg, Command, Parser};
 use colored::Colorize;
 use std::env;
 use std::io::Error;
 
-mod yaml_validator;
-use crate::yaml_validator::yaml_validator_mod::validate_yaml;
-
-mod map_headers;
-use crate::map_headers::mapping_headers::map_fasta_head;
-
-mod remap_head;
-use crate::remap_head::remapping_headers::remapping_head;
-
-mod split_by_size;
-use crate::split_by_size::split_by_size_mod::split_file_by_size;
-
-mod split_by_count;
-use crate::split_by_count::split_by_count_mod::split_file_by_count;
+mod cli;
+use cli::{Cli, Commands};
 
 mod generics;
 //use crate::generics::validate_fasta;
 
-mod tpf_fasta;
-use crate::tpf_fasta::tpf_fasta_mod::curate_fasta;
+// Reference: https://doc.rust-lang.org/book/ch07-02-defining-modules-to-control-scope-and-privacy.html
+use crate::processors::yaml_validator::validate_yaml;
+use crate::processors::split_by_count::split_file_by_count;
+use crate::processors::split_by_size::split_file_by_size;
+use crate::processors::map_headers::map_fasta_head;
+use crate::processors::remap_head::remapping_head;
+use crate::processors::tpf_fasta::curate_fasta;
+use crate::processors::exclude_seq::filter_fasta;
+mod processors;
 
-mod exclude_seq;
-use crate::exclude_seq::exclude_seq_mod::filter_fasta;
 
 fn main() -> Result<(), Error> {
-    let split_options = ["pep", "cds", "cdna", "rna", "other"];
-    let match_result = command!()
-    .about("A program for fasta manipulation and yaml validation ~ Used in TreeVal project")
-    .subcommand(
-        Command::new("validateyaml")
-            .about("Subcommand for validating the users TreeVal yaml file")
-            .arg(
-                Arg::new("yaml")
-                    .required(true)
-                    .help("Path to the TreeVal yaml file generated by the user")
-            )
-            .arg(
-                Arg::new("verbose")
-                    .short('v')
-                    .value_parser(clap::value_parser!(bool))
-                    .default_value("false")
-                    .help("Print explainers as to why validation fails, if it does fail")
-            )
-            .arg(
-                Arg::new("output")
-                    .short('o')
-                    .default_value("./")
-                    .help("Output the log to file")
-            )
-    )
-    .subcommand(
-        Command::new("splitbycount")
-            .about("Subcommand for splitting fasta files by number of sequence-header pairs, e.g., 100 pairs per file")
-            .arg(
-                Arg::new("fasta-file")
-                    .short('f')
-                    .required(true)
-                    .help("A path to a valid fasta file.")
-            )
-            .arg(
-                Arg::new("output-directory")
-                    .short('o')
-                    .default_value("./")
-                    .help("The output directory that files will be placed in | outfile will be formatted like {input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa")
-            )
-            .arg(
-                Arg::new("data_type")
-                    .short('d')
-                    .value_parser(clap::builder::PossibleValuesParser::new(split_options))
-                    .help("The data type of the input data")
-            )
-            .arg(
-                Arg::new("sanitise")
-                    .short('s')
-                    .value_parser(clap::value_parser!(bool))
-                    .help("Do we need to sanitise the headers of the input fasta")
-            )
-            .arg(
-                Arg::new("count")
-                    .short('c')
-                    .value_parser(clap::value_parser!(u16))
-                    .help("How many sequences per file")
-            )
-    )
-    .subcommand(
-        Command::new("splitbysize")
-            .about("Subcommand for splitting fasta files by user given size (in MegaBytes) into n (fasta_size / user_given_size) files")
-            .arg(
-                Arg::new("fasta-file")
-                    .short('f')
-                    .required(true)
-                    .help("A path to a valid fasta file.")
-            )
-            .arg(
-                Arg::new("mem-size")
-                    .short('s')
-                    .required(true)
-                    .value_parser(clap::value_parser!(u16))
-                    .help("Size in MB that a fasta file is to be chunked into")
-            )
-            .arg(
-                Arg::new("output-directory")
-                    .short('o')
-                    .default_value("./")
-                    .help("The output directory that files will be placed in")
-            )
-    )
-    .subcommand(
-        Command::new("geneset_csvs")
-            .about("Subcommand to generate csv files that condense geneset directories generated by splitbycount/splitbysize. Mainly for use in TreeVal")
-            .arg(
-                Arg::new("geneset_dir")
-                    .short('d')
-                    .required(true)
-                    .help("The path to the top level directory of your geneset directory.")
-            )
-            .arg(
-                Arg::new("specifiy_clade")
-                    .short('c')
-                    .required(true)
-                    .default_value("ALL")
-                    .help("Specify the clade folder to refresh")
-            )
-    )
-    .subcommand(
-        Command::new("mapheaders")
-            .about("Subcommand for stripping out headers and replacing with a standardised automatic or user-given string, this also returns a dict of old:new headers")
-            .arg(
-                Arg::new("fasta-file")
-                    .short('f')
-                    .required(true)
-                    .help("A path to a valid fasta file.")
-            )
-            .arg(
-                Arg::new("output-directory")
-                    .short('o')
-                    .default_value("./")
-                    .help("The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta")
-            )
-            .arg(
-                Arg::new("replace-with")
-                    .short('r')
-                    .default_value("FMMH")
-                    .help("The new header format, appended with a numerical value. Without being set the new header will default to 'FMMH_{numberical}'")
-            )
-    )
-    .subcommand(
-        Command::new("remapheaders")
-            .about("Subcommand for stripping out previously mapped headers and replacing with the old headers")
-            .arg(
-                Arg::new("fasta-file")
-                    .short('f')
-                    .required(true)
-                    .help("A path to a valid fasta file.")
-            )
-            .arg(
-                Arg::new("output-directory")
-                    .short('o')
-                    .default_value("./new")
-                    .help("The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta")
-            )
-            .arg(
-                Arg::new("map-file")
-                    .short('m')
-                    .required(true)
-                    .help("The original mapped header field, a TSV of old-header, new-header")
-            )
-    )
-    .subcommand(
-        Command::new("profile")
-        .about("Profile an input fasta file and return various statistics")
-        .arg(
-            Arg::new("fasta-file")
-                .short('f')
-                .required(true)
-                .help("The input fasta file for profiling")
-        )
-        .arg(
-            Arg::new("output-dir")
-                .short('o')
-                .default_value("FasMan-out")
-                .help("The input fasta file for profiling")
-        )
-    )
-    .subcommand(
-        Command::new("curate")
-        .about("Convert an tpf file and original fasta file into a fasta file - useful for curation")
-        .arg(
-            Arg::new("fasta")
-                .short('f')
-                .required(true)
-                .help("The input fasta file for re-organising")
-        )
-        .arg(
-            Arg::new("tpf")
-                .short('t')
-                .required(true)
-                .help("The TPF file used to re-organise the input fasta")
-        )
-        .arg(
-            Arg::new("sort")
-                .short('s')
-                .value_parser(clap::value_parser!(bool))
-                .default_value("false")
-                .help("Size sort the output or leave as order in AGP")
-        )
-        .arg(
-            Arg::new("output")
-                .short('o')
-                .default_value("new.fasta")
-                .help("The output name of the new fasta file")
-        )
-        .arg(
-            Arg::new("n_length")
-                .value_parser(clap::value_parser!(usize))
-                .default_value("200")
-                .help("Length that the N (gap) string should be.")
-        )
-    )
-    .subcommand(
-        Command::new("subset")
-        .about("Subset a fasta file in a random manner by percentage of file")
-        .arg(
-            Arg::new("fasta-file")
-                .short('f')
-                .required(true)
-                .help("The input fasta file for profiling")
-        )
-        .arg(
-            Arg::new("random")
-                .short('r')
-                .value_parser(clap::value_parser!(bool))
-                .help("Random subset of input file. Default skims the first X given percent")
-        )
-        .arg(
-            Arg::new("percent")
-                .short('p')
-                .value_parser(clap::value_parser!(u16))
-                .default_value("50")
-                .help("Percentage of the original file entries that should be retained")
-        )
-    )
-    .subcommand(
-        Command::new("filterfasta")
-            .about("Filter a given list of sequences from fasta file")
-            .arg(
-                Arg::new("fasta")
-                    .short('f')
-                    .required(true)
-                    .help("A fasta file for processing")
-            )
-            .arg(
-                Arg::new("output")
-                    .short('o')
-                    .default_value("FiilteredFasta.fa")
-                    .help("The outfile naming")
-            )
-            .arg(
-                Arg::new("filter_list")
-                    .short('l')
-                    .help("A string comma-separated list of sequence names to exclude from the final fasta")
-            )
-    )
-    .subcommand(
-        Command::new("mergehaps")
-        .about("Merge haplotypes / multi fasta files together")
-        .arg(
-            Arg::new("fasta-1")
-                .short('p')
-                .required(true)
-                .help("The input fasta file for re-organising")
-        )
-        .arg(
-            Arg::new("fasta-2")
-                .short('s')
-                .required(true)
-                .help("The second input fasta file")
-        )
-        .arg(
-            Arg::new("naming")
-                .short('s')
-                .default_value("PRI/HAP")
-                .help("A '/' separated list with an item per file, these are the namings of the new scaffolds in the merged output")
-        )
-        .arg(
-            Arg::new("output")
-                .short('o')
-                .default_value("merged")
-                .help("Output file prefix")
-        )
-    )
-    .get_matches();
 
-    println! {
-        "{}\n{}\n{}\nRUNNING SUBCOMMAND: |\n-- {}\nRUNNING ON: |\n-- {}",
-        "WELCOME TO Fasta Manipulator".bold(),
-        "This has been made to help prep data for use in the Treeval and curationpretext pipelines".bold(),
-        "ONLY THE yamlvalidator IS SPECIFIC TO TREEVAL, THE OTHER COMMANDS CAN BE USED FOR ANY OTHER PURPOSE YOU WANT".purple(),
-        match_result.subcommand_name().unwrap(),
-        env::consts::OS
-    };
+    let cli = Cli::parse();
 
-    match match_result.subcommand_name() {
-        Some("splitbysize") => split_file_by_size(match_result.subcommand_matches("splitbysize")),
-        Some("splitbycount") => {
-            split_file_by_count(match_result.subcommand_matches("splitbycount"))
+    match &cli.command {
+        Some(Commands::YamlValidator { yaml, verbose, output }) => {
+            validate_yaml(yaml, verbose, output)
+        },
+        Some(Commands::SplitByCount { fasta_file, output_directory, data_type, sanitise, count}) => {
+            split_file_by_count(
+                fasta_file, output_directory, data_type, sanitise, count
+            )
+        },
+        Some(Commands::SplitBySize { fasta_file, mem_size, output_directory }) => {
+            split_file_by_size(
+                fasta_file,
+                mem_size,
+                output_directory
+            )
+        },
+        Some(Commands::MapHeaders { fasta_file, output_directory, replace_with }) => {
+            _ = map_fasta_head(fasta_file, output_directory, replace_with)
+        },
+        Some(Commands::ReMapHeaders { fasta_file, output_directory, map_file }) => {
+            remapping_head(fasta_file, output_directory, map_file)
+        }
+        Some(Commands::Curate { fasta, tpf, sort, output, n_length}) => {
+            curate_fasta(fasta, tpf, sort, output, n_length)
         }
-        Some("mapheaders") => {
-            _ = map_fasta_head(match_result.subcommand_matches("mapheaders"));
+        Some(Commands::FilterFasta { fasta, output, filter_list }) => {
+            filter_fasta(fasta, output, filter_list)
         }
-        Some("validateyaml") => validate_yaml(match_result.subcommand_matches("validateyaml")),
-        Some("remapheaders") => remapping_head(match_result.subcommand_matches("remapheaders")),
-        Some("curate") => curate_fasta(match_result.subcommand_matches("curate")),
-        Some("filterfasta") => filter_fasta(match_result.subcommand_matches("filterfasta")),
-        _ => {
-            unreachable!()
+        Some(Commands::GenesetCSVS { .. }) => { todo!() },
+        Some(Commands::Profile { .. }) => { todo!() }
+        Some(Commands::Subset { .. }) => { todo!() }
+        Some(Commands::Mergehaps { .. }) => { todo!() }
+        None => {
+            panic!("No command given!")
         }
-    };
+    }
     Ok(())
 }
diff --git a/src/map_headers.rs b/src/map_headers.rs
deleted file mode 100644
index 2b066b0..0000000
--- a/src/map_headers.rs
+++ /dev/null
@@ -1,142 +0,0 @@
-pub mod mapping_headers {
-
-    use clap::ArgMatches;
-    use colored::Colorize;
-    use std::error::Error;
-    use std::fmt;
-    use std::fs::File;
-    use std::io::{BufRead, BufReader, BufWriter, Write};
-    use std::iter::Zip;
-
-    use crate::generics::only_keys;
-    use crate::generics::validate_fasta;
-
-    #[allow(dead_code)]
-    #[derive(Debug, Clone)]
-    struct EmptyVec;
-    impl Error for EmptyVec {}
-
-    impl fmt::Display for EmptyVec {
-        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-            write!(f, "Can't Display Empty Vec")
-        }
-    }
-
-    #[allow(clippy::explicit_counter_loop)]
-    pub fn create_mapping(
-        name_vec: Vec<std::string::String>,
-        new_name: &str,
-    ) -> Zip<std::vec::IntoIter<std::string::String>, std::vec::IntoIter<std::string::String>> {
-        // Generate a new mapping for the Fasta
-        //
-        let mut new_heads: Vec<String> = Vec::new();
-        let mut head_counter: i32 = 0;
-        let name_vec_clone = name_vec.clone();
-
-        for _x in name_vec {
-            new_heads.push(format!("{}_{}", new_name, head_counter));
-            head_counter += 1;
-        }
-
-        let mapped_heads: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>> =
-            name_vec_clone.into_iter().zip(new_heads);
-
-        mapped_heads
-    }
-
-    pub fn save_mapping(
-        output: &str,
-        mapped: Zip<
-            std::vec::IntoIter<std::string::String>,
-            std::vec::IntoIter<std::string::String>,
-        >,
-    ) {
-        let f: File = File::create(output).expect("Unable to create file");
-        let mut f: BufWriter<File> = BufWriter::new(f);
-        for map_pair in mapped {
-            let line: String = format!("{}\t{}\n", map_pair.0, map_pair.1);
-            f.write_all(&line.into_bytes())
-                .expect("Unable to write data");
-        }
-    }
-
-    #[allow(unused_mut)]
-    pub fn create_mapped_fasta(
-        input: &str,
-        output: &str,
-        mapped: Zip<
-            std::vec::IntoIter<std::string::String>,
-            std::vec::IntoIter<std::string::String>,
-        >,
-    ) {
-        let file_reader: File = File::open(input).expect("CAN'T OPEN FILE");
-        let buff_reader: BufReader<File> = BufReader::new(file_reader);
-        let mut new_fasta: File = File::create(output).unwrap();
-
-        for line in buff_reader.lines() {
-            let l: &str = &line.as_ref().unwrap()[..];
-            if l.starts_with('>') {
-                let mut to_replace = l.replace('>', "");
-                let mut mapped_heads: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>> =
-                    mapped.clone();
-                let mut map: Option<(String, String)> =
-                    mapped_heads.find(|x: &(String, String)| x.0 == to_replace);
-                let mut new_head: String = map.expect("").1;
-                let fmt_head: String = format!(">{}\n", new_head);
-                let _ = new_fasta.write_all(&fmt_head.into_bytes());
-            } else {
-                let mut seq = line.expect("");
-                let fmt_seq = format!("{}\n", seq);
-                let _ = new_fasta.write_all(&fmt_seq.into_bytes());
-            }
-        }
-    }
-
-    pub fn map_fasta_head(
-        arguments: std::option::Option<&ArgMatches>,
-    ) -> Result<(), Box<dyn Error>> {
-        let file: &String = arguments.unwrap().get_one::<String>("fasta-file").unwrap();
-        let replacer: &String = arguments
-            .unwrap()
-            .get_one::<String>("replace-with")
-            .unwrap();
-        let output: &String = arguments
-            .unwrap()
-            .get_one::<String>("output-directory")
-            .unwrap();
-
-        println!("Mapping headers for file: {}", file);
-        println!("Replace headers with string: {:?}", &replacer);
-
-        match validate_fasta(file) {
-            Ok(names) => {
-                let new_names = Vec::from_iter(only_keys(names));
-
-                let new_map: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>> =
-                    create_mapping(new_names, replacer);
-
-                let map_to_save: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>> =
-                    new_map.clone();
-                let output_file = format!("{}mapped-heads.tsv", output);
-
-                save_mapping(&output_file, map_to_save);
-
-                let new_fasta: String = format!("{output}mapped.fasta");
-
-                create_mapped_fasta(file, &new_fasta, new_map);
-
-                println!(
-                    "{}\n{}\n\t{}\n\t{}",
-                    "FASTA HAS BEEN MAPPED AND REWRITTEN".green(),
-                    "FOUND HERE:".green(),
-                    &new_fasta.green(),
-                    &output_file.green()
-                );
-            }
-
-            Err(e) => panic!("Something is wrong with the file! | {}", e),
-        };
-
-        Ok(())
-    }
-}
diff --git a/src/processors.rs b/src/processors.rs
new file mode 100644
index 0000000..0c1ad7f
--- /dev/null
+++ b/src/processors.rs
@@ -0,0 +1,7 @@
+pub mod yaml_validator;
+pub mod split_by_count;
+pub mod split_by_size;
+pub mod map_headers;
+pub mod remap_head;
+pub mod tpf_fasta;
+pub mod exclude_seq;
diff --git a/src/processors/exclude_seq.rs b/src/processors/exclude_seq.rs
new file mode 100644
index 0000000..89ecc8d
--- /dev/null
+++ b/src/processors/exclude_seq.rs
@@ -0,0 +1,40 @@
+use clap::ArgMatches;
+use noodles::fasta;
+use std::error::Error;
+use std::{fs, io::BufRead, str};
+
+/// Copy every record of `fasta` into `out_file`, skipping records whose name
+/// is listed in `exclusions`. The input is opened before the output so that an
+/// unreadable FASTA does not leave an empty output file behind. The output is
+/// opened in append mode, so records are added to any existing content.
+fn open_fasta<'a>(
+    exclusions: Vec<&str>,
+    fasta: &'a str,
+    out_file: &str,
+) -> std::result::Result<&'a str, Box<dyn Error>> {
+    // Fail early on a bad input, before the output file is created.
+    let mut reader: fasta::Reader<Box<dyn BufRead>> = fasta::reader::Builder
+        .build_from_path(fasta)
+        .map_err(|_| "Error: Fasta is not valid check file!")?;
+    let file = fs::OpenOptions::new()
+        .create(true)
+        .append(true)
+        .open(out_file)?;
+    let mut writer = fasta::Writer::new(file);
+
+    for result in reader.records() {
+        let record = result?;
+        if exclusions.contains(&record.name()) {
+            println!("Found record to exclude: {:?}", &record.name());
+        } else {
+            writer.write_record(&record)?;
+        }
+    }
+    Ok("Removed Exclusionary List")
+}
+
+/// Filter `fasta` into `outfile` without the comma-separated names in `exclude`.
+pub fn filter_fasta(fasta: &String, outfile: &String, exclude: &String) {
+    let outcome = open_fasta(exclude.split(',').collect(), fasta, outfile);
+    if let Err(e) = outcome { eprintln!("{e}") } // was silently discarded before
+}
diff --git a/src/processors/map_headers.rs b/src/processors/map_headers.rs
new file mode 100644
index 0000000..18a9847
--- /dev/null
+++ b/src/processors/map_headers.rs
@@ -0,0 +1,131 @@
+use clap::ArgMatches;
+use colored::Colorize;
+use std::error::Error;
+use std::fmt;
+use std::fs::File;
+use std::io::{BufRead, BufReader, BufWriter, Write};
+use std::iter::Zip;
+
+use crate::generics::only_keys;
+use crate::generics::validate_fasta;
+
+#[allow(dead_code)]
+#[derive(Debug, Clone)]
+struct EmptyVec;
+impl Error for EmptyVec {}
+
+impl fmt::Display for EmptyVec {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "Can't Display Empty Vec")
+    }
+}
+
+/// Pair every original header in `name_vec` with a generated replacement.
+///
+/// * `name_vec` - the original headers.
+/// * `new_name` - prefix shared by all generated headers.
+///
+/// Replacements have the form `<new_name>_<index>`, with the index starting at
+/// 0 and following the order of `name_vec`.
+///
+/// Returns an iterator of `(original, replacement)` pairs.
+pub fn create_mapping(
+    name_vec: Vec<std::string::String>,
+    new_name: &str,
+) -> Zip<std::vec::IntoIter<std::string::String>, std::vec::IntoIter<std::string::String>> {
+    // Build the replacements from the length alone, so the input vector can be
+    // moved into the zip instead of being cloned first.
+    let new_heads: Vec<String> = (0..name_vec.len())
+        .map(|index| format!("{}_{}", new_name, index))
+        .collect();
+
+    name_vec.into_iter().zip(new_heads)
+}
+
+/// Write the pairs of `mapped` to `output` as tab-separated lines, one
+/// `<pair.0><TAB><pair.1>` line per pair.
+///
+/// Panics if the file cannot be created or written.
+pub fn save_mapping(
+    output: &str,
+    mapped: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>>,
+) {
+    let mut writer = BufWriter::new(File::create(output).expect("Unable to create file"));
+    for (old_head, new_head) in mapped {
+        writeln!(writer, "{}\t{}", old_head, new_head).expect("Unable to write data");
+    }
+    // Flush explicitly: an error raised while dropping the BufWriter is lost.
+    writer.flush().expect("Unable to write data");
+}
+
+/// Rewrite the FASTA at `input` into `output` with headers replaced via `mapped`.
+///
+/// `mapped` yields `(current_header, replacement_header)` pairs. Header lines
+/// have every `>` removed before lookup; other lines are copied as they are.
+///
+/// Panics if a file cannot be opened, read or written, or if a header has no
+/// entry in `mapped`.
+pub fn create_mapped_fasta(
+    input: &str,
+    output: &str,
+    mapped: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>>,
+) {
+    // Index the mapping once instead of cloning and scanning it per header.
+    // `or_insert` keeps the first pair for a duplicated key, as `find` did.
+    let mut lookup = std::collections::HashMap::new();
+    for (old_head, new_head) in mapped {
+        lookup.entry(old_head).or_insert(new_head);
+    }
+
+    let reader = BufReader::new(File::open(input).expect("CAN'T OPEN FILE"));
+    let mut new_fasta = BufWriter::new(File::create(output).expect("CAN'T CREATE FILE"));
+    for line in reader.lines() {
+        let mut line = line.expect("CAN'T READ LINE");
+        if line.starts_with('>') {
+            let key = line.replace('>', "");
+            line = format!(">{}", lookup.get(&key).expect("HEADER NOT IN MAP"));
+        }
+        writeln!(new_fasta, "{}", line).expect("CAN'T WRITE LINE");
+    }
+    new_fasta.flush().expect("CAN'T WRITE LINE");
+}
+
+/// Replace every header of the FASTA `file` with `<replacer>_<n>`.
+///
+/// Writes `<output>mapped-heads.tsv` (the old-to-new mapping) and
+/// `<output>mapped.fasta` (the rewritten FASTA); `output` is used as a plain
+/// string prefix. Always returns `Ok(())`; panics if `validate_fasta` rejects
+/// `file`.
+pub fn map_fasta_head(
+    file: &String, output: &String, replacer: &String
+) -> Result<(), Box<dyn Error>> {
+    println!("Mapping headers for file: {}", file);
+    println!("Replace headers with string: {:?}", &replacer);
+
+    let names = match validate_fasta(file) {
+        Ok(names) => names,
+        Err(e) => panic!("Something is wrong with the file! | {}", e),
+    };
+
+    let old_heads = Vec::from_iter(only_keys(names));
+    let header_map: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>> =
+        create_mapping(old_heads, replacer);
+
+    // The mapping is consumed twice: once for the TSV, once for the FASTA.
+    let output_file = format!("{}mapped-heads.tsv", output);
+    save_mapping(&output_file, header_map.clone());
+
+    let new_fasta: String = format!("{output}mapped.fasta");
+    create_mapped_fasta(file, &new_fasta, header_map);
+
+    println!(
+        "{}\n{}\n\t{}\n\t{}",
+        "FASTA HAS BEEN MAPPED AND REWRITTEN".green(),
+        "FOUND HERE:".green(),
+        &new_fasta.green(),
+        &output_file.green()
+    );
+
+    Ok(())
+}
+
diff --git a/src/processors/remap_head.rs b/src/processors/remap_head.rs
new file mode 100644
index 0000000..44ef5d5
--- /dev/null
+++ b/src/processors/remap_head.rs
@@ -0,0 +1,66 @@
+use std::fs::File;
+use std::io::{BufRead, BufReader};
+use std::iter::Zip;
+
+use colored::Colorize;
+
+use crate::generics::validate_fasta;
+use crate::processors::map_headers::create_mapped_fasta;
+
+/// Load a header mapping from the tab-separated `map_file`.
+///
+/// Each line holds `old<TAB>new`; further columns are ignored and empty lines
+/// are skipped. The returned iterator yields `(new, old)` pairs, i.e. with the
+/// columns swapped.
+///
+/// Panics on an unopenable/unreadable file or a line with fewer than two columns.
+pub fn pull_map_from_tsv(
+    map_file: &str,
+) -> Zip<std::vec::IntoIter<std::string::String>, std::vec::IntoIter<std::string::String>> {
+    let buff_reader = BufReader::new(File::open(map_file).expect("CAN'T OPEN FILE"));
+    let mut old_head: Vec<String> = Vec::new();
+    let mut new_head: Vec<String> = Vec::new();
+    for line in buff_reader.lines() {
+        let string = line.expect("CAN'T READ MAP FILE");
+        if string.is_empty() {
+            continue; // tolerate empty lines instead of panicking on them
+        }
+        let mut old_new = string.split('\t');
+        match (old_new.next(), old_new.next()) {
+            (Some(old), Some(new)) => {
+                old_head.push(old.to_string());
+                new_head.push(new.to_string());
+            }
+            _ => panic!("MAP LINE NEEDS TWO TAB-SEPARATED COLUMNS: {:?}", string),
+        }
+    }
+    new_head.into_iter().zip(old_head)
+}
+
+/// Rewrite the headers of the FASTA `file` using the mapping in `map_file`.
+///
+/// Headers found in the second column of `map_file` are replaced by the first
+/// column; the result is written to `<output>_OH.fasta`. If `validate_fasta`
+/// rejects `file`, the reason is printed and nothing is written.
+pub fn remapping_head(file: &String, output: &String, map_file: &String) {
+    println!("Mapping headers for file: {}", file);
+    println!("Using header map file: {}", map_file);
+
+    if let Err(e) = validate_fasta(file) {
+        println!("NOT A VALID FASTA | {}", e);
+        return;
+    }
+
+    let new_map: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>> =
+        pull_map_from_tsv(map_file);
+    let new_fasta: String = format!("{output}_OH.fasta");
+    create_mapped_fasta(file, &new_fasta, new_map);
+
+    println!(
+        "{}\n{}\n\t{}\n",
+        "FASTA HAS BEEN RE-MAPPED AND REWRITTEN".green(),
+        "FOUND HERE:".green(),
+        &new_fasta.green()
+    );
+}
+
diff --git a/src/processors/split_by_count.rs b/src/processors/split_by_count.rs
new file mode 100644
index 0000000..dda0b39
--- /dev/null
+++ b/src/processors/split_by_count.rs
@@ -0,0 +1,100 @@
+use crate::generics::sanitise_header;
+use clap::ArgMatches;
+use compare::{natural, Compare};
+use noodles::fasta::{self, Record};
+use std::cmp::Ordering;
+use std::fs::OpenOptions;
+use std::{
+    fs::{create_dir_all, File},
+    io::BufReader,
+    path::Path,
+};
+
+/// Return `records` as is or, when `sanitise` is set, rebuilt around the header
+/// produced by `sanitise_header` (no description; the sequence is kept).
+fn fix_head(records: Record, sanitise: bool) -> Record {
+    if !sanitise {
+        return records; // already owned, so no clone is needed
+    }
+
+    let header = sanitise_header(records.definition());
+    let definition = fasta::record::Definition::new(header, None);
+    fasta::Record::new(definition, records.sequence().to_owned())
+}
+
+/// Write `fasta_record` to the file at `outdir`, replacing any existing file.
+///
+/// Despite its name, `outdir` is the full path of the output file. Panics if
+/// the file cannot be created or a record cannot be written.
+fn write_fasta(outdir: &String, fasta_record: &Vec<Record>) {
+    println!("{}", outdir);
+    // One create-and-truncate open replaces the old create-then-reopen pair.
+    let mut options = OpenOptions::new();
+    let file = options.write(true).create(true).truncate(true).open(outdir);
+    let mut writer = fasta::Writer::new(file.expect("creation failed"));
+    for record in fasta_record {
+        writer.write_record(record).unwrap();
+    }
+}
+
+/// Split `fasta_file` into files holding at most `fasta_count` records each.
+///
+/// Chunks are written to `<output_directory>/<name>/<data_type>/`, where
+/// `<name>` is the input file name up to its first `.`. Each chunk is named
+/// `<name>_f<file index>_c<fasta_count>-a<records in file>.fa`. With `sanitise`
+/// set, every header is passed through `sanitise_header` first.
+///
+/// A `fasta_count` of 0 never triggers a split, so all records end up in a
+/// single file. No file is written for an empty final chunk.
+///
+/// Panics if the input cannot be opened or parsed, if its name is not valid
+/// UTF-8, or if the output directory or files cannot be written.
+pub fn split_file_by_count(fasta_file: &String, output_directory: &String, data_type: &String, sanitise: &bool, fasta_count: &u16) {
+    let grab_name = Path::new(fasta_file).file_name().unwrap();
+    // Everything before the first '.' is the base name, e.g. "a.fa.gz" -> "a".
+    let actual_name = grab_name.to_str().unwrap().split('.').next().unwrap();
+
+    let new_outpath = format!("{}/{}/{}/", output_directory, actual_name, data_type);
+    create_dir_all(&new_outpath).unwrap();
+    println!(
+        "Fasta file for processing: {:?}\nNumber of records per file: {:?}",
+        fasta_file, fasta_count
+    );
+
+    let chunk_size = usize::from(*fasta_count);
+    let mut file_counter: u16 = 1;
+    // Builds the output path for the chunk with the given index and size.
+    let chunk_path = |index: u16, records: usize| {
+        format!(
+            "{}{}_f{}_c{}-a{}.fa",
+            new_outpath, actual_name, index, fasta_count, records
+        )
+    };
+
+    let mut reader = File::open(fasta_file)
+        .map(BufReader::new)
+        .map(fasta::Reader::new)
+        .unwrap();
+
+    let mut record_list: Vec<Record> = Vec::new();
+    for result in reader.records() {
+        let record = fix_head(result.unwrap(), *sanitise);
+        record_list.push(record);
+
+        // A full chunk: write it out and start collecting the next one.
+        if record_list.len() == chunk_size {
+            write_fasta(&chunk_path(file_counter, record_list.len()), &record_list);
+            file_counter += 1;
+            record_list.clear();
+        }
+    }
+
+    // Write the remainder. It is empty when the record count is an exact
+    // multiple of `fasta_count` (or the input has no records); writing it then
+    // would only produce an empty FASTA file.
+    if !record_list.is_empty() {
+        let full_outpath = chunk_path(file_counter, record_list.len());
+        write_fasta(&full_outpath, &record_list);
+    }
+}
+
diff --git a/src/processors/split_by_size.rs b/src/processors/split_by_size.rs
new file mode 100644
index 0000000..5a65719
--- /dev/null
+++ b/src/processors/split_by_size.rs
@@ -0,0 +1,10 @@
+use clap::ArgMatches;
+
+/// Placeholder for size-based splitting: only echoes its arguments so far.
+///
+/// `output_directory` is accepted but not used yet; nothing is written.
+pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, output_directory: &String) {
+    println!("Fasta file for processing: {:?}", &fasta_file);
+    println!("Size to chunk fasta into: {:?}", mem_size);
+}
+
diff --git a/src/processors/tpf_fasta.rs b/src/processors/tpf_fasta.rs
new file mode 100644
index 0000000..f8863b3
--- /dev/null
+++ b/src/processors/tpf_fasta.rs
@@ -0,0 +1,272 @@
+use std::{fs::File, fs::read_to_string, str};
+use std::fs::OpenOptions;
+use std::io::Write;
+
+use noodles::core::Position;
+use noodles::fasta;
+use noodles::fasta::record::Sequence;
+use noodles::fasta::repository::adapters::IndexedReader;
+
+use crate::generics::validate_fasta;
+
+/// One TPF entry: a region of an original scaffold together with the new
+/// scaffold it is assigned to.
+#[derive(Debug, Clone, PartialEq, Eq)]
+struct Tpf {
+    ori_scaffold: String, // name of the scaffold the region comes from
+    start_coord: usize,   // first coordinate of the region
+    end_coord: usize,     // second coordinate of the region
+    new_scaffold: String, // name of the scaffold the region is assigned to
+    orientation: String,  // orientation column; "MINUS" requests reverse complement
+}
+
+/// Renders as a tab followed by `<ori_scaffold> -- <start_coord> -- <end_coord>`.
+impl std::fmt::Display for Tpf {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        let Tpf { ori_scaffold, start_coord, end_coord, .. } = self;
+        write!(fmt, "\t{} -- {} -- {}", ori_scaffold, start_coord, end_coord)
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+struct NewFasta { // a TPF entry paired with the sequence extracted for it
+    tpf: Tpf,         // copy of the entry the sequence was cut for
+    sequence: String, // the region's bases after `check_orientation`
+}
+
+#[derive(Debug)]
+struct MyRecord { // accumulator for one output scaffold in `save_to_fasta`
+    name: String,          // name of the output scaffold
+    sequence: Vec<String>, // sequence pieces, joined when the scaffold is written
+}
+
+/// Parse the TPF file at `path` into `Tpf` entries, keeping file order.
+/// Only lines starting with `?` are used: field 2 is `<scaffold>:<start>-<end>`,
+/// field 3 the new scaffold (`RL` becomes `SUPER`), field 4 the orientation.
+/// Panics if the file cannot be read or such a line is malformed.
+fn parse_tpf(path: &String) -> Vec<Tpf> {
+    let contents = read_to_string(path).unwrap();
+    let to_entry = |line: &str| {
+        let fields: Vec<&str> = line.split_whitespace().collect();
+        let region: Vec<&str> = fields[1].split(':').collect();
+        let coords: Vec<&str> = region[1].split('-').collect();
+        Tpf {
+            ori_scaffold: region[0].to_owned(),
+            start_coord: coords[0].parse::<usize>().unwrap(),
+            end_coord: coords[1].parse::<usize>().unwrap(),
+            new_scaffold: fields[2].replace("RL", "SUPER"),
+            orientation: fields[3].to_owned(),
+        }
+    };
+    contents.lines().filter(|l| l.starts_with('?')).map(to_entry).collect()
+}
+
+/// Collect references to the entries of `tpf` that belong to one scaffold.
+///
+/// `fasta.0` is the scaffold name to look for; `fasta.1` is not used. An entry
+/// is kept when its `ori_scaffold` equals the name.
+///
+/// The result preserves the order of `tpf` and may be empty.
+fn subset_vec_tpf<'a>(
+    tpf: &'a Vec<Tpf>,
+    fasta: (&std::string::String, &usize),
+) -> Vec<&'a Tpf> {
+    let (scaffold_name, _) = fasta;
+    tpf.iter()
+        .filter(|entry| entry.ori_scaffold == *scaffold_name)
+        .collect()
+}
+
+/// Return the bases of `parsed` as a `String`, honouring `orientation`.
+///
+/// For `"MINUS"` the sequence is complemented and reversed (reverse
+/// complement); any other value returns the bases unchanged.
+///
+/// Panics if `parsed` is `None`, on bases that cannot be complemented, or if
+/// the bases are not valid UTF-8.
+fn check_orientation(
+    parsed: std::option::Option<noodles::fasta::record::Sequence>,
+    orientation: String,
+) -> String {
+    let whole = Position::try_from(1).unwrap()..;
+    let sequence = parsed.unwrap();
+    if orientation != "MINUS" {
+        return str::from_utf8(sequence.get(whole).unwrap()).unwrap().chars().collect();
+    }
+    let complement: Sequence = sequence.complement().collect::<Result<_, _>>().unwrap();
+    let bases = complement.get(whole).unwrap();
+    str::from_utf8(bases).unwrap().chars().rev().collect()
+}
+
+/// Cut the region of every TPF entry out of `sequence`.
+///
+/// `tpf` is expected to hold only entries of the scaffold `sequence` belongs
+/// to. Each entry's `start_coord..=end_coord` slice is passed through
+/// `check_orientation` and stored next to a copy of the entry.
+///
+/// Panics if `sequence` is `None` or a coordinate cannot be converted to a
+/// `Position`.
+fn parse_seq(
+    sequence: std::option::Option<noodles::fasta::record::Sequence>,
+    tpf: Vec<&Tpf>,
+) -> Vec<NewFasta> {
+    let new_seq = sequence.unwrap(); // Option(Sequence ()) -> Sequence ()
+    tpf.iter()
+        .map(|&entry| {
+            let start = Position::try_from(entry.start_coord).unwrap();
+            let end = Position::try_from(entry.end_coord).unwrap();
+            let region = new_seq.slice(start..=end);
+            let the_sequence = check_orientation(region, entry.orientation.to_owned());
+            NewFasta {
+                tpf: entry.to_owned(),
+                sequence: the_sequence,
+            }
+        })
+        .collect()
+}
+
+/// List the distinct `new_scaffold` names of `tpf_list` in first-seen order.
+fn get_uniques(tpf_list: &Vec<Tpf>) -> Vec<String> {
+    // A set makes the membership test O(1) instead of rescanning the output.
+    let mut seen = std::collections::HashSet::new();
+    tpf_list
+        .iter()
+        .filter(|entry| seen.insert(entry.new_scaffold.as_str()))
+        .map(|entry| entry.new_scaffold.to_owned())
+        .collect()
+}
+
+fn save_to_fasta(
+    fasta_data: Vec<NewFasta>,
+    tpf_data: Vec<Tpf>,
+    output: &String,
+    n_length: usize,
+) {
+    // Write one FASTA record per distinct `new_scaffold` to `output`: the
+    // scaffold's pieces are joined by `n_length` Ns and wrapped at 60 columns.
+    // Pieces are collected in `tpf_data` order, i.e. the order of the input TPF.
+    // A per-scaffold listing of the pieces used is also written to `debug.txt`.
+    let _data_file = File::create(output); // result ignored; a failure surfaces at the open below
+    let mut file = OpenOptions::new()
+        .write(true)
+        .open(output)
+        .expect("creation failed");
+
+    let _debugger = File::create("debug.txt"); // created in the current working directory
+    let mut file2 = OpenOptions::new()
+        .write(true)
+        .open("debug.txt")
+        .expect("creation failed");
+
+    let uniques = get_uniques(&tpf_data);
+
+    // This is inefficient: `fasta_data` is scanned once per matching TPF entry
+    // of every unique scaffold (10 uniques x 100 fasta entries = 1000 scans).
+    let mut no_more: Vec<String> = Vec::new(); // scaffolds handled so far; only used for the progress message
+    for x in uniques {
+        println!("NOW WRITING DATA FOR: {:?}", &x);
+        // e.g. x = "SUPER_1"; the header line goes to both files
+        let stringy = format!(">{x}\n");
+        file.write_all(stringy.as_bytes())
+            .expect("Unable to write to file");
+        file2
+            .write_all(stringy.as_bytes())
+            .expect("Unable to write to file");
+
+        let mut data: MyRecord = MyRecord {
+            name: "".to_string(),
+            sequence: Vec::new(),
+        };
+
+        no_more.push(x.to_owned());
+        x.clone_into(&mut data.name);
+        for tpf in &tpf_data {
+            if tpf.new_scaffold == x {
+                for fasta in &fasta_data {
+                    if fasta.tpf == *tpf {
+                        let stringy = format!("\t{}\n", tpf); // `Tpf`'s Display adds a second leading tab
+                        file2
+                            .write_all(stringy.as_bytes())
+                            .expect("Unable to write to file");
+                        data.sequence.push(fasta.sequence.to_owned());
+                    }
+                }
+            }
+        }
+
+        let line_len: usize = 60; // bytes of sequence per output line
+        let fixed = data.sequence;
+        let n_string = "N".repeat(n_length); // gap inserted between consecutive pieces
+        let fixed2 = fixed.join(&n_string); //.join required a borrowed str
+        let fixed3 = fixed2
+            .as_bytes()
+            .chunks(line_len)
+            .map(str::from_utf8)
+            .collect::<Result<Vec<&str>, _>>()
+            .unwrap(); // panics if a 60-byte chunk is not valid UTF-8
+
+        for i in fixed3 {
+            let formatted = i.to_owned() + "\n";
+            file.write_all(formatted.as_bytes()).unwrap();
+        }
+        println!("NO LONG SCANNING FOR: {:?}", &no_more)
+    }
+}
+
+/// Generate a curated FASTA from `fasta_file` and the TPF in `tpf_file`.
+///
+/// The TPF is the one produced by Pretext and the agp_to_tpf script. The new
+/// FASTA is written to `output`; it uses the TPF's scaffold naming and pieces
+/// each scaffold together from the listed regions, separated by `n_length` Ns.
+/// `sort` is currently unused.
+///
+/// Panics if the FASTA fails validation, cannot be opened as an indexed FASTA,
+/// or a sequence cannot be fetched from it.
+pub fn curate_fasta(fasta_file: &String, tpf_file: &String, sort: &bool, output: &String, n_length: &usize) {
+    println!("LET'S GET CURATING THAT FASTA!");
+    stacker::maybe_grow(32 * 1024, 1024 * 5120, || {
+        match validate_fasta(fasta_file) {
+            Ok(fasta_d) => {
+                let tpf_data = parse_tpf(tpf_file);
+                // NOTE: the TPF is not cross-checked against the FASTA here.
+
+                //
+                // Start indexed reader of the input fasta
+                // if valid then use the data
+                //
+                let reader =
+                    fasta::indexed_reader::Builder::default().build_from_path(fasta_file);
+                let fasta_repo = match reader {
+                    Ok(data) => {
+                        // Wrap the indexed reader so sequences can be fetched by name.
+                        let adapter = IndexedReader::new(data);
+                        fasta::Repository::new(adapter)
+                    }
+                    Err(e) => panic!("Unable to open indexed FASTA {} | {}", fasta_file, e),
+                };
+
+                //
+                // For each scaffold in the fasta file, iterate through and
+                // parse the sequence for each line in the tpf.
+                // The tpf is expected to contain at least one entry per scaffold.
+                //
+                let mut new_fasta_data: Vec<NewFasta> = Vec::new();
+                for i in fasta_d {
+                    let subset_tpf = subset_vec_tpf(&tpf_data, (&i.0, &i.1));
+                    let sequence = fasta_repo.get(&i.0).transpose();
+
+                    match sequence {
+                        Ok(data) => {
+                            let subset_results = parse_seq(data, subset_tpf);
+                            new_fasta_data.extend(subset_results);
+                        }
+                        Err(e) => panic!("{:?}", e),
+                    };
+                }
+                save_to_fasta(new_fasta_data, tpf_data, output, n_length.to_owned())
+            }
+            Err(e) => panic!("Something is wrong with the file! | {}", e),
+        }
+    })
+}
+
diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs
new file mode 100644
index 0000000..d2a08f4
--- /dev/null
+++ b/src/processors/yaml_validator.rs
@@ -0,0 +1,265 @@
+use std::fs::{self, File};
+use std::io::ErrorKind;
+use std::path::PathBuf;
+
+use colored::Colorize;
+use csv::Error;
+use csv::ReaderBuilder;
+use noodles::fasta;
+use serde::{Deserialize, Serialize};
+
+// Would be nice if there was a simple format_check
+// use noodles::cram as cram;
+
+#[derive(Debug, Serialize, Deserialize)]
+struct TreeValYaml { // top-level layout of the YAML deserialized by `validate_yaml`
+    assembly: Assembly,
+    reference_file: String, // path checked as "REFERENCE" and read as FASTA
+    assem_reads: AssemReads,
+    alignment: Alignment,
+    self_comp: SelfComp,
+    intron: Intron,
+    telomere: Telomere,
+    synteny: Synteny,
+    busco: Busco,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct Assembly { // the `assembly` section
+    level: String,
+    sample_id: String, // printed when validation starts
+    latin_name: String,
+    classT: String, // appended to the geneset and synteny base paths
+    asmVersion: u16,
+    gevalType: String,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct AssemReads { // the `assem_reads` section
+    pacbio: String, // path checked as "PACBIO", then scanned with dtype "pacbio"
+    hic: String, // path checked as "HIC", then scanned with dtype "hic"
+    supplementary: String, // not read by the validation in this module
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct Alignment { // the `alignment` section
+    data_dir: String, // base path the geneset CSV paths are built from
+    common_name: String,
+    geneset: String, // comma-separated geneset names
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct SelfComp { // the `self_comp` section; deserialized, not otherwise read in this module
+    motif_len: u16,
+    mummer_chunk: u16,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct Intron { // the `intron` section; deserialized, not otherwise read in this module
+    size: String,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct Telomere { // the `telomere` section; deserialized, not otherwise read in this module
+    teloseq: String,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct Synteny { // the `synteny` section
+    synteny_genome_path: String, // base path; `classT` and "/" are appended for the genome directory
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct Busco { // the `busco` section
+    lineages_path: String, // base path; "/lineages/" and `lineage` are appended
+    lineage: String, // last component of the BUSCO database path
+}
+
+//
+// CSV STRUCT
+//
+//#[derive(Deserialize)]
+//struct Record {
+//    org: String,
+//    type: String,
+//    data_file: String
+//}
+
+/// Report on stdout whether `path` exists, then run the content check tied to
+/// `field_id`: `"REFERENCE"` is parsed as FASTA, `"GENESET-CSV"` as CSV. Every
+/// other `field_id` only gets the existence check.
+pub fn validate_paths(path: &str, field_id: &str) {
+    if fs::metadata(path).is_err() {
+        println!(
+            "{}{}   \t{}\t{}",
+            "<-".red().bold(),
+            &field_id.red().bold(),
+            "| CHECK YAML!:".red().bold(),
+            path
+        );
+        return;
+    }
+    println!(
+        "{}{}   \t{}\t{}",
+        ">-".green(),
+        &field_id.green(),
+        "| PATH EXISTS: ".green(),
+        path.green()
+    );
+    match field_id {
+        "REFERENCE" => validate_fasta(path),
+        "GENESET-CSV" => validate_csv(path).unwrap_or_else(|e| println!("<-CSV ERROR: {e}")),
+        // Ids without a content check used to print a spurious "Error" here.
+        _ => {}
+    }
+}
+
+/// Print how many items the FASTA reader yields for `path` (items that fail
+/// to parse are counted too). Panics if the reader cannot be built.
+pub fn validate_fasta(path: &str) {
+    let mut reader = fasta::reader::Builder
+        .build_from_path(path)
+        .expect("NO VALID HEADER / SEQUENCE PAIRS");
+    println!(
+        "{} {} {}",
+        ">- REFERENCE H/S PAIRS:".green(),
+        reader.records().count(),
+        "H/S PAIRS".green()
+    )
+}
+
+/// Count the data rows of the comma-separated file at `path` and print the
+/// total. The first row is treated as a header; rows that fail to parse are
+/// still counted.
+///
+/// Returns the error from opening `path`, if any.
+pub fn validate_csv(path: &str) -> Result<(), Error> {
+    let mut reader = ReaderBuilder::new()
+        .has_headers(true)
+        .delimiter(b',')
+        .from_reader(File::open(path)?);
+    println!(
+        "{} {} {}",
+        ">-GENESET-RECORD-COUNT: >".green(),
+        reader.records().count(),
+        "<".green()
+    );
+    Ok(())
+}
+
+/// Check that the directory at `path` holds the data files expected for `dtype`.
+///
+/// * `"pacbio"`  - file names ending in `.fasta.gz`
+/// * `"hic"`     - file names ending in `.cram` or `.crai`
+/// * `"synteny"` - file names ending in `.fa`, `.fasta` or `.fna`
+///
+/// The matching paths, or a warning when there are none, are printed. Any
+/// other `dtype` prints nothing. A missing directory is ignored here; any
+/// other error while opening the directory panics.
+pub fn validate_data(path: &str, dtype: &str) {
+    let data_files = match fs::read_dir(path) {
+        Err(e) if e.kind() == ErrorKind::NotFound => return,
+        Err(e) => panic!("{} {e}", "<-DIRECTORY PATH DOESN'T EXIST: ".red().bold()),
+        Ok(data_files) => data_files,
+    };
+    match dtype {
+        "pacbio" => report_data_files(
+            data_files,
+            &[".fasta.gz"],
+            "<-NO PACBIO DATA FILES",
+            ">-YOUR FILES ARE:",
+        ),
+        "hic" => report_data_files(
+            data_files,
+            &[".cram", ".crai"],
+            "<-NO HIC DATA FILES",
+            ">-YOUR FILES ARE:",
+        ),
+        "synteny" => report_data_files(
+            data_files,
+            &[".fa", ".fasta", ".fna"],
+            "<-NO SYNTENIC GENOMES",
+            ">-YOUR GENOMES ARE:",
+        ),
+        _ => {}
+    }
+}
+
+/// Print the entries of `data_files` whose file name ends with one of
+/// `suffixes` under the `found` label, or the `missing` warning if none do.
+///
+/// The whole file name is compared: `Path::extension` only returns the part
+/// after the last dot, so it can never equal a double extension such as
+/// `fasta.gz`.
+fn report_data_files(data_files: fs::ReadDir, suffixes: &[&str], missing: &str, found: &str) {
+    let files: Vec<PathBuf> = data_files
+        .filter_map(|f| f.ok())
+        .map(|f| f.path())
+        .filter(|file| {
+            let name = file.file_name().map(|n| n.to_string_lossy());
+            name.map_or(false, |n| suffixes.iter().any(|suffix| n.ends_with(*suffix)))
+        })
+        .collect();
+    if files.is_empty() {
+        println!("{}", missing.red())
+    } else {
+        println!("{} {:?}", found.green(), &files);
+    }
+}
+
+/// Validate the TreeVal YAML at `file`, printing the outcome of each check.
+///
+/// Covers the reference FASTA, the geneset/synteny/BUSCO paths, the PacBio and
+/// HiC read directories, one CSV per listed geneset, the class-specific synteny
+/// directory and the BUSCO lineage directory. `verbose` and `output` are
+/// currently unused.
+/// Panics if `file` cannot be opened or does not deserialize.
+pub fn validate_yaml(file: &String, verbose: &bool, output: &String) {
+    println! {"Validating Yaml: {}", file.purple()};
+
+    let input = fs::File::open(file).expect("Unable to read from file");
+    let contents: TreeValYaml =
+        serde_yaml::from_reader(input).expect("Unable to read from file");
+    let (class, busco) = (&contents.assembly.classT, &contents.busco);
+
+    println!("RUNNING VALIDATE-YAML FOR SAMPLE: {}", contents.assembly.sample_id.purple());
+
+    validate_paths(&contents.reference_file, "REFERENCE");
+    validate_paths(&contents.alignment.data_dir, "GENESET");
+    validate_paths(&contents.synteny.synteny_genome_path, "SYNTENY");
+    validate_paths(&busco.lineages_path, "BUSCO");
+
+    validate_paths(&contents.assem_reads.pacbio, "PACBIO");
+    validate_data(&contents.assem_reads.pacbio, "pacbio");
+
+    validate_paths(&contents.assem_reads.hic, "HIC");
+    validate_data(&contents.assem_reads.hic, "hic");
+
+    println!("{}", "CHECKING GENESET DIRECTORY RESOLVES".blue());
+    for set in contents.alignment.geneset.split(',') {
+        let gene_alignment_path = format!(
+            "{}{}/csv_data/{}-data.csv",
+            contents.alignment.data_dir, class, set
+        );
+        validate_paths(&gene_alignment_path, "GENESET-CSV");
+    }
+
+    println!("{}", "CHECKING SYNTENY DIRECTORY RESOLVES".blue());
+    let synteny_full = format!("{}{}/", contents.synteny.synteny_genome_path, class);
+    validate_paths(&synteny_full, "SYNTENY-FASTA");
+    validate_data(&synteny_full, "synteny");
+
+    println!("{}", "CHECKING BUSCO DIRECTORY RESOLVES".blue());
+    let busco_path = format!("{}/lineages/{}", busco.lineages_path, busco.lineage);
+    validate_paths(&busco_path, "BUSCO-DB");
+    // NOW CHECK FOR FILES IN DIRECTORY?
+
+    println!(
+        "{}\n{}\n{}\n{}\n{}",
+        "VALIDATION COMPLETE".purple().bold(),
+        "GENERAL INFORMATION:".purple().bold(),
+        "Check the log to see what failed".bold(),
+        "FULL : ONLY synteny fails are permitted".purple(),
+        "RAPID: geneset, busco and synteny fails are permitted".purple()
+    );
+}
diff --git a/src/remap_head.rs b/src/remap_head.rs
deleted file mode 100644
index 83e20ef..0000000
--- a/src/remap_head.rs
+++ /dev/null
@@ -1,73 +0,0 @@
-pub mod remapping_headers {
-    use crate::map_headers;
-    use clap::ArgMatches;
-    use colored::Colorize;
-    use std::fs::File;
-    use std::io::{BufRead, BufReader};
-    use std::iter::Zip;
-
-    use crate::generics::validate_fasta;
-
-    pub fn pull_map_from_tsv(
-        map_file: &str,
-    ) -> Zip<std::vec::IntoIter<std::string::String>, std::vec::IntoIter<std::string::String>> {
-        let file_reader: File = File::open(map_file).expect("CAN'T OPEN FILE");
-        let buff_reader: BufReader<File> = BufReader::new(file_reader);
-
-        let mut old_head: Vec<String> = Vec::new();
-        let mut new_head: Vec<String> = Vec::new();
-
-        for line in buff_reader.lines() {
-            match line {
-                Ok(string) => {
-                    let mut old_new = string.split('\t');
-                    let x = old_new.next().unwrap();
-                    let y = old_new.next().unwrap();
-                    old_head.push(x.to_string());
-                    new_head.push(y.to_string());
-                }
-                Err(_) => {
-                    print!("")
-                }
-            };
-        }
-
-        let mapped_heads: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>> =
-            new_head.into_iter().zip(old_head);
-
-        mapped_heads
-    }
-
-    pub fn remapping_head(arguments: std::option::Option<&ArgMatches>) {
-        let file: &String = arguments.unwrap().get_one::<String>("fasta-file").unwrap();
-        let map_file: &String = arguments.unwrap().get_one::<String>("map-file").unwrap();
-        let output: &String = arguments
-            .unwrap()
-            .get_one::<String>("output-directory")
-            .unwrap();
-
-        println!("Mapping headers for file: {}", file);
-        println!("Replace headers with string: {}", map_file);
-
-        match validate_fasta(file) {
-            Ok(_thing) => {
-                let new_map: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>> =
-                    pull_map_from_tsv(map_file);
-
-                let new_fasta: String = format!("{output}_OH.fasta");
-
-                map_headers::mapping_headers::create_mapped_fasta(file, &new_fasta, new_map);
-
-                println!(
-                    "{}\n{}\n\t{}\n",
-                    "FASTA HAS BEEN RE-APPED AND REWRITTEN".green(),
-                    "FOUND HERE:".green(),
-                    &new_fasta.green()
-                );
-            }
-            Err(_) => {
-                println!("NOT A VALID FASTA")
-            }
-        };
-    }
-}
diff --git a/src/split_by_count.rs b/src/split_by_count.rs
deleted file mode 100644
index 1396f00..0000000
--- a/src/split_by_count.rs
+++ /dev/null
@@ -1,111 +0,0 @@
-pub mod split_by_count_mod {
-    use crate::generics::sanitise_header;
-    use clap::ArgMatches;
-    use compare::{natural, Compare};
-    use noodles::fasta::{self, Record};
-    use std::cmp::Ordering;
-    use std::fs::OpenOptions;
-    use std::{
-        fs::{create_dir_all, File},
-        io::BufReader,
-        path::Path,
-    };
-
-    #[allow(clippy::needless_return)]
-    fn fix_head(records: Record, sanitise: bool) -> Record {
-        if sanitise {
-            let header = sanitise_header(records.definition());
-            let definition = fasta::record::Definition::new(header, None);
-            let seq = records.sequence().to_owned();
-            return fasta::Record::new(definition, seq);
-        } else {
-            return records.to_owned();
-        };
-    }
-
-    fn write_fasta(outdir: &String, fasta_record: &Vec<Record>) {
-        println!("{}", outdir);
-
-        let _data_file = File::create(outdir);
-        let file = OpenOptions::new()
-            .append(true)
-            .open(outdir)
-            .expect("creation failed");
-
-        let mut writer = fasta::Writer::new(file);
-        for i in fasta_record {
-            writer.write_record(i).unwrap();
-        }
-    }
-
-    pub fn split_file_by_count(arguments: std::option::Option<&ArgMatches>) {
-        let sanitise: &bool = arguments.unwrap().get_one::<bool>("sanitise").unwrap();
-        let fasta_file = arguments.unwrap().get_one::<String>("fasta-file").unwrap();
-        let path_obj = Path::new(fasta_file);
-        let grab_name = path_obj.file_name().unwrap();
-        let actual_list: Vec<&str> = grab_name.to_str().unwrap().split('.').collect();
-        let actual_name = actual_list[0];
-
-        let data_type = arguments.unwrap().get_one::<String>("data_type").unwrap();
-
-        let outpath = arguments
-            .unwrap()
-            .get_one::<String>("output-directory")
-            .unwrap();
-
-        let new_outpath = format!("{}/{}/{}/", outpath, actual_name, data_type);
-        create_dir_all(new_outpath.clone()).unwrap();
-        let fasta_count = arguments.unwrap().get_one::<u16>("count").unwrap();
-        println!(
-            "Fasta file for processing: {:?}\nNumber of records per file: {:?}",
-            fasta_file, fasta_count
-        );
-
-        let mut counter: u16 = 0;
-        let mut file_counter: u16 = 1;
-
-        let file_name: Vec<&str> = actual_name.split('.').collect();
-
-        let mut reader = File::open(fasta_file)
-            .map(BufReader::new)
-            .map(fasta::Reader::new)
-            .unwrap();
-
-        let mut record_list: Vec<Record> = Vec::new();
-        for result in reader.records() {
-            let record = result.unwrap();
-            counter += 1;
-
-            let final_rec = fix_head(record, *sanitise);
-            record_list.push(final_rec);
-
-            let cmp = natural();
-            let compared = cmp.compare(&counter, fasta_count);
-            if compared == Ordering::Equal {
-                let full_outpath = format!(
-                    "{}{}_f{}_c{}-a{}.fa",
-                    new_outpath,
-                    file_name[0],
-                    file_counter,
-                    &fasta_count,
-                    &record_list.len()
-                );
-
-                write_fasta(&full_outpath, &record_list);
-                file_counter += 1;
-                counter = 0;
-                record_list = Vec::new();
-            }
-        }
-
-        let full_outpath = format!(
-            "{}{}_f{}_c{}-a{}.fa",
-            new_outpath,
-            file_name[0],
-            file_counter,
-            &fasta_count,
-            &record_list.len()
-        );
-        write_fasta(&full_outpath, &record_list);
-    }
-}
diff --git a/src/split_by_size.rs b/src/split_by_size.rs
deleted file mode 100644
index f1b4a7b..0000000
--- a/src/split_by_size.rs
+++ /dev/null
@@ -1,12 +0,0 @@
-pub mod split_by_size_mod {
-    use clap::ArgMatches;
-
-    pub fn split_file_by_size(arguments: std::option::Option<&ArgMatches>) {
-        let fasta_file: &String = arguments.unwrap().get_one::<String>("fasta-file").unwrap();
-        println!("Fasta file for processing: {:?}", &fasta_file);
-        println!(
-            "Size to chunk fasta into: {:?}",
-            arguments.unwrap().get_one::<u16>("mem-size").unwrap()
-        );
-    }
-}
diff --git a/src/tpf_fasta.rs b/src/tpf_fasta.rs
deleted file mode 100644
index fc5ec7e..0000000
--- a/src/tpf_fasta.rs
+++ /dev/null
@@ -1,277 +0,0 @@
-pub mod tpf_fasta_mod {
-    use clap::ArgMatches;
-    use noodles::core::Position;
-    use noodles::fasta;
-    use noodles::fasta::record::Sequence;
-    use noodles::fasta::repository::adapters::IndexedReader;
-    use std::fs::OpenOptions;
-    use std::io::Write;
-    use std::{fs::read_to_string, fs::File, str};
-
-    use crate::generics::validate_fasta;
-
-    #[derive(Debug, Clone, PartialEq, Eq)]
-    struct Tpf {
-        ori_scaffold: String,
-        start_coord: usize,
-        end_coord: usize,
-        new_scaffold: String,
-        orientation: String,
-    }
-
-    impl std::fmt::Display for Tpf {
-        fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
-            write!(
-                fmt,
-                "\t{} -- {} -- {}",
-                self.ori_scaffold, self.start_coord, self.end_coord
-            )
-        }
-    }
-
-    #[derive(Debug, PartialEq, Eq)]
-    struct NewFasta {
-        tpf: Tpf,
-        sequence: String,
-    }
-
-    #[derive(Debug)]
-    struct MyRecord {
-        name: String,
-        sequence: Vec<String>,
-    }
-
-    fn parse_tpf(path: &String) -> Vec<Tpf> {
-        let mut all_tpf: Vec<Tpf> = Vec::new();
-        for line in read_to_string(path).unwrap().lines() {
-            if line.starts_with('?') {
-                let line_replaced = line.replace('\t', " ");
-                let line_list: Vec<&str> = line_replaced.split_whitespace().collect();
-                let scaff_data: Vec<&str> = line_list[1].split(':').collect();
-                let scaff_coords: Vec<&str> = scaff_data[1].split('-').collect();
-                let data = Tpf {
-                    ori_scaffold: scaff_data[0].to_owned(),
-                    start_coord: scaff_coords[0].to_owned().parse::<usize>().unwrap(),
-                    end_coord: scaff_coords[1].to_owned().parse::<usize>().unwrap(),
-                    new_scaffold: line_list[2].to_owned().replace("RL", "SUPER"),
-                    orientation: line_list[3].to_owned(),
-                };
-                all_tpf.push(data);
-            }
-        }
-        all_tpf
-    }
-
-    fn subset_vec_tpf<'a>(
-        tpf: &'a Vec<Tpf>,
-        fasta: (&std::string::String, &usize),
-    ) -> Vec<&'a Tpf> {
-        //
-        // Subset the Vec<TPF> based on a search through the fasta
-        //
-        let mut subset_tpf: Vec<&Tpf> = Vec::new();
-        for i in tpf {
-            if i.ori_scaffold == *fasta.0 {
-                subset_tpf.push(i)
-            }
-        }
-        subset_tpf
-    }
-
-    fn check_orientation(
-        parsed: std::option::Option<noodles::fasta::record::Sequence>,
-        orientation: String,
-    ) -> String {
-        if orientation == "MINUS" {
-            let start = Position::try_from(1).unwrap();
-            let parse_orientation = parsed.unwrap();
-            let compliment: Sequence = parse_orientation
-                .complement()
-                .collect::<Result<_, _>>()
-                .unwrap();
-            let seq = compliment.get(start..).unwrap();
-            str::from_utf8(seq).unwrap().chars().rev().collect()
-        } else {
-            let start = Position::try_from(1).unwrap();
-            let parse_orientation = parsed.unwrap();
-            let seq = parse_orientation.get(start..).unwrap();
-            str::from_utf8(seq).unwrap().chars().collect()
-        }
-    }
-
-    fn parse_seq(
-        sequence: std::option::Option<noodles::fasta::record::Sequence>,
-        tpf: Vec<&Tpf>,
-    ) -> Vec<NewFasta> {
-        let mut subset_tpf: Vec<NewFasta> = Vec::new();
-        //
-        // Take the input sequence and scaffold name
-        // Parse the input sequence based on the data contained in
-        // the TPF. Which is already a subset based on scaff name
-        //
-
-        let new_seq = sequence.unwrap(); // Option(Sequence ()) -> Sequence ()
-        for &i in &tpf {
-            let start = Position::try_from(i.start_coord).unwrap();
-            let end = Position::try_from(i.end_coord).unwrap();
-            //let region = Region::new(&i.new_scaffold, start.unwrap()..=end.unwrap());
-            let parsed = new_seq.slice(start..=end);
-            let the_sequence = check_orientation(parsed, i.orientation.to_owned());
-            let data = NewFasta {
-                tpf: i.to_owned(),
-                sequence: the_sequence,
-            };
-            subset_tpf.push(data);
-        }
-        subset_tpf
-    }
-
-    fn get_uniques(tpf_list: &Vec<Tpf>) -> Vec<String> {
-        let mut uniques: Vec<String> = Vec::new();
-
-        for i in tpf_list {
-            if !uniques.contains(&i.new_scaffold) {
-                uniques.push(i.new_scaffold.to_owned())
-            }
-        }
-        uniques
-    }
-
-    fn save_to_fasta(
-        fasta_data: Vec<NewFasta>,
-        tpf_data: Vec<Tpf>,
-        output: &String,
-        n_length: usize,
-    ) {
-        //
-        // TPF is in the input TPF order, this will continue to be the case until
-        // the script is modified and the Tpf struct gets modified in place for some reason
-        //
-        let _data_file = File::create(output);
-        let mut file = OpenOptions::new()
-            .write(true)
-            .open(output)
-            .expect("creation failed");
-
-        let _debugger = File::create("debug.txt");
-        let mut file2 = OpenOptions::new()
-            .write(true)
-            .open("debug.txt")
-            .expect("creation failed");
-
-        let uniques = get_uniques(&tpf_data);
-
-        // This is inefficient as we are scanning through the fasta_data, uniques number of times
-        // If uniques is 10 long and fasta is 100, then this is 1000 scans through in total.
-        let mut no_more: Vec<String> = Vec::new();
-        for x in uniques {
-            println!("NOW WRITING DATA FOR: {:?}", &x);
-            // X = "SUPER_1"
-            let stringy = format!(">{x}\n");
-            file.write_all(stringy.as_bytes())
-                .expect("Unable to write to file");
-            file2
-                .write_all(stringy.as_bytes())
-                .expect("Unable to write to file");
-
-            let mut data: MyRecord = MyRecord {
-                name: "".to_string(),
-                sequence: Vec::new(),
-            };
-
-            no_more.push(x.to_owned());
-            x.clone_into(&mut data.name);
-            for tpf in &tpf_data {
-                if tpf.new_scaffold == x {
-                    for fasta in &fasta_data {
-                        if fasta.tpf == *tpf {
-                            let stringy = format!("\t{}\n", tpf);
-                            file2
-                                .write_all(stringy.as_bytes())
-                                .expect("Unable to write to file");
-                            data.sequence.push(fasta.sequence.to_owned());
-                        }
-                    }
-                }
-            }
-
-            let line_len: usize = 60;
-            let fixed = data.sequence;
-            let n_string = "N".repeat(n_length);
-            let fixed2 = fixed.join(&n_string); //.join required a borrowed str
-            let fixed3 = fixed2
-                .as_bytes()
-                .chunks(line_len)
-                .map(str::from_utf8)
-                .collect::<Result<Vec<&str>, _>>()
-                .unwrap();
-
-            for i in fixed3 {
-                let formatted = i.to_owned() + "\n";
-                file.write_all(formatted.as_bytes()).unwrap();
-            }
-            println!("NO LONG SCANNING FOR: {:?}", &no_more)
-        }
-    }
-
-    #[allow(clippy::needless_borrow)]
-    #[allow(clippy::let_and_return)]
-    pub fn curate_fasta(arguments: std::option::Option<&ArgMatches>) {
-        //
-        // Generate a curated fasta file based on the input TPF file
-        // which was generated by Pretext and the agp_to_tpf script.
-        // This new fasta file contains a new scaffold naming as well
-        // as pieced together sequences generated by the splitting of
-        // data in Pretext.
-        //
-        let fasta_file: &String = arguments.unwrap().get_one::<String>("fasta").unwrap();
-        let tpf_file: &String = arguments.unwrap().get_one::<String>("tpf").unwrap();
-        let n_length: &usize = arguments.unwrap().get_one::<usize>("n_length").unwrap();
-        let output: &String = arguments.unwrap().get_one::<String>("output").unwrap();
-        println!("LET'S GET CURATING THAT FASTA!");
-        stacker::maybe_grow(32 * 1024, 1024 * 5120, || {
-            match validate_fasta(fasta_file) {
-                Ok(fasta_d) => {
-                    let tpf_data = parse_tpf(&tpf_file);
-                    //let _validated = varify_validity(&tpf_data, &fasta_d);
-
-                    //
-                    // Start indexed reader of the input fasta
-                    // if valid then use the data
-                    //
-                    let reader =
-                        fasta::indexed_reader::Builder::default().build_from_path(fasta_file);
-                    let fasta_repo = match reader {
-                        Ok(data) => {
-                            let adapter = IndexedReader::new(data);
-                            let repository = fasta::Repository::new(adapter);
-                            repository
-                        }
-                        Err(_) => todo!(),
-                    };
-
-                    //
-                    // For unique scaffold in the fasta file iter through and
-                    // parse sequence for each line in the tpf
-                    // The tpf will contain multiple enteries for each scaffold, minimum of one entry.
-                    //
-                    let mut new_fasta_data: Vec<NewFasta> = Vec::new();
-                    for i in fasta_d {
-                        let subset_tpf = subset_vec_tpf(&tpf_data, (&i.0, &i.1));
-                        let sequence = fasta_repo.get(&i.0).transpose();
-
-                        match sequence {
-                            Ok(data) => {
-                                let subset_results = parse_seq(data, subset_tpf);
-                                new_fasta_data.extend(subset_results);
-                            }
-                            Err(e) => panic!("{:?}", e),
-                        };
-                    }
-                    save_to_fasta(new_fasta_data, tpf_data, output, n_length.to_owned())
-                }
-                Err(e) => panic!("Something is wrong with the file! | {}", e),
-            }
-        })
-    }
-}
diff --git a/src/yaml_validator.rs b/src/yaml_validator.rs
deleted file mode 100644
index e23d121..0000000
--- a/src/yaml_validator.rs
+++ /dev/null
@@ -1,272 +0,0 @@
-pub mod yaml_validator_mod {
-    use clap::ArgMatches;
-    use colored::Colorize;
-    use csv::Error;
-    use csv::ReaderBuilder;
-    use noodles::fasta;
-    use serde::{Deserialize, Serialize};
-    use std::fs::{self, File};
-    use std::io::ErrorKind;
-    use std::path::PathBuf;
-    // Would be nice if there was a simple format_check
-    // use noodles::cram as cram;
-
-    #[derive(Debug, Serialize, Deserialize)]
-    struct TreeValYaml {
-        assembly: Assembly,
-        reference_file: String,
-        assem_reads: AssemReads,
-        alignment: Alignment,
-        self_comp: SelfComp,
-        intron: Intron,
-        telomere: Telomere,
-        synteny: Synteny,
-        busco: Busco,
-    }
-
-    #[derive(Debug, Serialize, Deserialize)]
-    struct Assembly {
-        level: String,
-        sample_id: String,
-        latin_name: String,
-        classT: String,
-        asmVersion: u16,
-        gevalType: String,
-    }
-
-    #[derive(Debug, Serialize, Deserialize)]
-    struct AssemReads {
-        pacbio: String,
-        hic: String,
-        supplementary: String,
-    }
-
-    #[derive(Debug, Serialize, Deserialize)]
-    struct Alignment {
-        data_dir: String,
-        common_name: String,
-        geneset: String,
-    }
-
-    #[derive(Debug, Serialize, Deserialize)]
-    struct SelfComp {
-        motif_len: u16,
-        mummer_chunk: u16,
-    }
-
-    #[derive(Debug, Serialize, Deserialize)]
-    struct Intron {
-        size: String,
-    }
-
-    #[derive(Debug, Serialize, Deserialize)]
-    struct Telomere {
-        teloseq: String,
-    }
-
-    #[derive(Debug, Serialize, Deserialize)]
-    struct Synteny {
-        synteny_genome_path: String,
-    }
-
-    #[derive(Debug, Serialize, Deserialize)]
-    struct Busco {
-        lineages_path: String,
-        lineage: String,
-    }
-
-    //
-    // CSV STRUCT
-    //
-    //#[derive(Deserialize)]
-    //struct Record {
-    //    org: String,
-    //    type: String,
-    //    data_file: String
-    //}
-
-    pub fn validate_paths(path: &str, field_id: &str) {
-        match fs::metadata(path) {
-            Ok(_) => {
-                println!(
-                    "{}{}   \t{}\t{}",
-                    ">-".green(),
-                    &field_id.green(),
-                    "| PATH EXISTS: ".green(),
-                    path.green()
-                );
-                match field_id {
-                    "REFERENCE" => validate_fasta(path),
-                    "GENESET-CSV" => {
-                        _ = validate_csv(path);
-                    }
-                    "HIC" => {}
-                    _ => println!("Error"),
-                }
-            }
-            Err(_) => println!(
-                "{}{}   \t{}\t{}",
-                "<-".red().bold(),
-                &field_id.red().bold(),
-                "| CHECK YAML!:".red().bold(),
-                path
-            ),
-        }
-    }
-
-    pub fn validate_fasta(path: &str) {
-        let reader = fasta::reader::Builder.build_from_path(path);
-
-        let mut binding = reader.expect("NO VALID HEADER / SEQUENCE PAIRS");
-        let result = binding.records();
-        let counter = result.count();
-        println!(
-            "{} {} {}",
-            ">- REFERENCE H/S PAIRS:".green(),
-            counter,
-            "H/S PAIRS".green()
-        )
-    }
-
-    pub fn validate_csv(path: &str) -> Result<(), Error> {
-        let file = File::open(path)?;
-
-        let mut reader = ReaderBuilder::new()
-            .has_headers(true)
-            .delimiter(b',')
-            .from_reader(file);
-
-        let record = reader.records().count();
-        println!(
-            "{} {} {}",
-            ">-GENESET-RECORD-COUNT: >".green(),
-            record,
-            "<".green()
-        );
-
-        Ok(())
-    }
-
-    //
-    // FUNCTION: Check if pacbio has fasta.gz files, cram has cram and crai and synteny has fasta
-    //           could make this much easier and consise by passing in a list of file types to check
-    //           validatedata(path, [fa, fna, fasta])
-    //
-    pub fn validate_data(path: &str, dtype: &str) {
-        match fs::read_dir(path) {
-            Err(e) if e.kind() == ErrorKind::NotFound => {}
-            Err(e) => panic!("{} {e}", "<-DIRECTORY PATH DOESN'T EXIST: ".red().bold()),
-            Ok(data_files) => {
-                if dtype == "pacbio" {
-                    let files: Vec<PathBuf> = data_files
-                        .filter_map(|f| f.ok())
-                        .filter(|d| match d.path().extension() {
-                            None => false,
-                            Some(ex) => ex == "fasta.gz",
-                        })
-                        .map(|f| f.path())
-                        .collect();
-
-                    if files.is_empty() {
-                        println!("{}", "<-NO PACBIO DATA FILES".red())
-                    } else {
-                        println!("{} {:?}", ">-YOUR FILES ARE:".green(), &files);
-                    }
-                } else if dtype == "hic" {
-                    let files: Vec<PathBuf> = data_files
-                        .filter_map(|f| f.ok())
-                        .filter(|d| match d.path().extension() {
-                            None => false,
-                            Some(ex) => ex == "cram" || ex == "crai",
-                        })
-                        .map(|f| f.path())
-                        .collect();
-
-                    if files.is_empty() {
-                        println!("{}", "<-NO HIC DATA FILES".red())
-                    } else {
-                        println!("{} {:?}", ">-YOUR FILES ARE:".green(), &files);
-                    }
-                } else if dtype == "synteny" {
-                    let files: Vec<PathBuf> = data_files
-                        .filter_map(|f| f.ok())
-                        .filter(|d| match d.path().extension() {
-                            None => false,
-                            Some(ex) => ex == "fa" || ex == "fasta" || ex == "fna",
-                        })
-                        .map(|f| f.path())
-                        .collect();
-
-                    if files.is_empty() {
-                        println!("{}", "<-NO SYNTENIC GENOMES".red())
-                    } else {
-                        println!("{} {:?}", ">-YOUR GENOMES ARE:".green(), &files);
-                    }
-                }
-            }
-        };
-    }
-
-    pub fn validate_yaml(arguments: std::option::Option<&ArgMatches>) {
-        let file = arguments.unwrap().get_one::<String>("yaml").unwrap();
-        let _output: &String = arguments
-            .unwrap()
-            .get_one::<String>("output-directory")
-            .unwrap();
-        let _verbose_flag: &bool = arguments.unwrap().get_one::<bool>("verbose").unwrap();
-
-        println! {"Validating Yaml: {}", file.purple()};
-
-        let input = fs::File::open(file).expect("Unable to read from file");
-        let contents: TreeValYaml =
-            serde_yaml::from_reader(input).expect("Unable to read from file");
-
-        println!(
-            "RUNNING VALIDATE-YAML FOR SAMPLE: {}",
-            contents.assembly.sample_id.purple()
-        );
-
-        validate_paths(&contents.reference_file, "REFERENCE");
-        validate_paths(&contents.alignment.data_dir, "GENESET");
-        validate_paths(&contents.synteny.synteny_genome_path, "SYNTENY");
-        validate_paths(&contents.busco.lineages_path, "BUSCO");
-
-        validate_paths(&contents.assem_reads.pacbio, "PACBIO");
-        validate_data(&contents.assem_reads.pacbio, "pacbio");
-
-        validate_paths(&contents.assem_reads.hic, "HIC");
-        validate_data(&contents.assem_reads.hic, "hic");
-
-        println!("{}", "CHECKING GENESET DIRECTORY RESOLVES".blue());
-        let genesets = contents.alignment.geneset.split(',');
-        for set in genesets {
-            let gene_alignment_path = contents.alignment.data_dir.clone()
-                + &contents.assembly.classT
-                + "/csv_data/"
-                + set
-                + "-data.csv";
-            validate_paths(&gene_alignment_path, "GENESET-CSV");
-        }
-
-        println!("{}", "CHECKING SYNTENY DIRECTORY RESOLVES".blue());
-        let synteny_full =
-            contents.synteny.synteny_genome_path.clone() + &contents.assembly.classT + "/";
-        validate_paths(&synteny_full, "SYNTENY-FASTA");
-        validate_data(&synteny_full, "synteny");
-
-        println!("{}", "CHECKING BUSCO DIRECTORY RESOLVES".blue());
-        let busco_path =
-            contents.busco.lineages_path.clone() + "/lineages/" + &contents.busco.lineage;
-        validate_paths(&busco_path, "BUSCO-DB");
-        // NOW CHECK FOR FILES IN DIRECTORY?
-
-        println!(
-            "{}\n{}\n{}\n{}\n{}",
-            "VALIDATION COMPLETE".purple().bold(),
-            "GENERAL INFORMATION:".purple().bold(),
-            "Check the log to see what failed".bold(),
-            "FULL : ONLY synteny fails are permitted".purple(),
-            "RAPID: geneset, busco and synteny fails are permitted".purple()
-        );
-    }
-}

From 6baebd18aeeae0d91aa154a996389ac4ab44c38a Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 24 May 2024 17:27:47 +0100
Subject: [PATCH 02/30] Adding unit test structure

---
 src/processors/exclude_seq.rs    | 11 +++++++++++
 src/processors/map_headers.rs    | 11 +++++++++++
 src/processors/remap_head.rs     | 10 ++++++++++
 src/processors/split_by_count.rs | 11 +++++++++++
 src/processors/split_by_size.rs  | 10 ++++++++++
 src/processors/tpf_fasta.rs      | 11 +++++++++++
 src/processors/yaml_validator.rs | 11 +++++++++++
 7 files changed, 75 insertions(+)

diff --git a/src/processors/exclude_seq.rs b/src/processors/exclude_seq.rs
index 89ecc8d..84251a2 100644
--- a/src/processors/exclude_seq.rs
+++ b/src/processors/exclude_seq.rs
@@ -38,3 +38,14 @@ pub fn filter_fasta(fasta: &String, outfile: &String, exclude: &String) {
     let _x = open_fasta(list_to_exclude, fasta, outfile);
 }
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn it_works() {
+        let result = 2 + 2;
+        assert_eq!(result, 4);
+    }
+}
+
diff --git a/src/processors/map_headers.rs b/src/processors/map_headers.rs
index 18a9847..3fa1101 100644
--- a/src/processors/map_headers.rs
+++ b/src/processors/map_headers.rs
@@ -129,3 +129,14 @@ pub fn map_fasta_head(
     Ok(())
 }
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn it_works() {
+        let result = 2 + 2;
+        assert_eq!(result, 4);
+    }
+}
+
diff --git a/src/processors/remap_head.rs b/src/processors/remap_head.rs
index 44ef5d5..c4f53ae 100644
--- a/src/processors/remap_head.rs
+++ b/src/processors/remap_head.rs
@@ -64,3 +64,13 @@ pub fn remapping_head(file: &String, output: &String, map_file: &String) {
     };
 }
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn it_works() {
+        let result = 2 + 2;
+        assert_eq!(result, 4);
+    }
+}
diff --git a/src/processors/split_by_count.rs b/src/processors/split_by_count.rs
index dda0b39..a220b14 100644
--- a/src/processors/split_by_count.rs
+++ b/src/processors/split_by_count.rs
@@ -98,3 +98,14 @@ pub fn split_file_by_count(fasta_file: &String, output_directory: &String, data_
     write_fasta(&full_outpath, &record_list);
 }
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn it_works() {
+        let result = 2 + 2;
+        assert_eq!(result, 4);
+    }
+}
+
diff --git a/src/processors/split_by_size.rs b/src/processors/split_by_size.rs
index 5a65719..c6452dd 100644
--- a/src/processors/split_by_size.rs
+++ b/src/processors/split_by_size.rs
@@ -8,3 +8,13 @@ pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, output_directory:
     );
 }
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn it_works() {
+        let result = 2 + 2;
+        assert_eq!(result, 4);
+    }
+}
diff --git a/src/processors/tpf_fasta.rs b/src/processors/tpf_fasta.rs
index f8863b3..9052b6f 100644
--- a/src/processors/tpf_fasta.rs
+++ b/src/processors/tpf_fasta.rs
@@ -270,3 +270,14 @@ pub fn curate_fasta(fasta_file: &String, tpf_file: &String, sort: &bool, output:
     })
 }
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn it_works() {
+        let result = 2 + 2;
+        assert_eq!(result, 4);
+    }
+}
+
diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs
index d2a08f4..a738f2e 100644
--- a/src/processors/yaml_validator.rs
+++ b/src/processors/yaml_validator.rs
@@ -263,3 +263,14 @@ pub fn validate_yaml(file: &String, verbose: &bool, output: &String) {
         "RAPID: geneset, busco and synteny fails are permitted".purple()
     );
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn it_works() {
+        let result = 2 + 2;
+        assert_eq!(result, 4);
+    }
+}

From dd524f6b593ee71ef765f75c113d65d7735d701e Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 24 May 2024 17:32:42 +0100
Subject: [PATCH 03/30] Cargo format

---
 src/cli/mod.rs                   | 26 +++++-----
 src/main.rs                      | 86 +++++++++++++++++++-------------
 src/processors.rs                |  8 +--
 src/processors/exclude_seq.rs    |  1 -
 src/processors/map_headers.rs    | 16 ++----
 src/processors/remap_head.rs     |  1 -
 src/processors/split_by_count.rs |  9 +++-
 src/processors/split_by_size.rs  |  5 +-
 src/processors/tpf_fasta.rs      | 26 ++++------
 src/processors/yaml_validator.rs |  7 +--
 10 files changed, 94 insertions(+), 91 deletions(-)

diff --git a/src/cli/mod.rs b/src/cli/mod.rs
index 74d50ef..80dc8e5 100644
--- a/src/cli/mod.rs
+++ b/src/cli/mod.rs
@@ -1,5 +1,5 @@
-use clap::{Parser, Subcommand};
 use clap::builder::Str;
+use clap::{Parser, Subcommand};
 
 const SPLIT_OPTIONS: [&str; 5] = ["pep", "cds", "cdna", "rna", "other"];
 
@@ -10,7 +10,7 @@ pub struct Cli {
     // command is optional (TODO: Make this not optional)
     // Reference: https://docs.rs/clap/latest/clap/_derive/_tutorial/chapter_2/index.html#defaults
     #[command(subcommand)]
-    pub command: Option<Commands>
+    pub command: Option<Commands>,
 }
 
 // Reference: https://docs.rs/clap/latest/clap/_derive/_tutorial/chapter_2/index.html
@@ -27,11 +27,10 @@ pub enum Commands {
 
         // Output the log to file
         #[arg(short = 'o', long, default_value_t=String::from("./"))]
-        output: String
+        output: String,
     },
 
     SplitByCount {
-
         // A path to a valid fasta file.
         #[arg(short = 'f', long)]
         fasta_file: String,
@@ -42,7 +41,7 @@ pub enum Commands {
 
         // The data type of the input data
         #[arg(short = 'd', value_parser = clap::builder::PossibleValuesParser::new(SPLIT_OPTIONS))]
-        data_type: String ,
+        data_type: String,
 
         // Do we need to sanitise the headers of the input fasta
         #[arg(short = 's', value_parser = clap::value_parser!(bool))]
@@ -87,7 +86,7 @@ pub enum Commands {
         output_directory: String,
 
         #[arg(short = 'r', default_value_t = String::from("FMMH"))]
-        replace_with: String
+        replace_with: String,
     },
 
     ReMapHeaders {
@@ -101,7 +100,7 @@ pub enum Commands {
 
         // "The original mapped header field, a TSV of old-header, new-header
         #[arg(short = 'm', default_value_t = String::from("FMMH"))]
-        map_file: String
+        map_file: String,
     },
 
     #[command(version, about="Profile an input fasta file and return various statistics", long_about = None)]
@@ -112,7 +111,7 @@ pub enum Commands {
 
         // The input fasta file for profiling
         #[arg(short = 'o', long, default_value_t = String::from("FasMan-out"))]
-        output_dir: String
+        output_dir: String,
     },
 
     Curate {
@@ -133,7 +132,7 @@ pub enum Commands {
 
         // Length that the N (gap) string should be.
         #[arg(short, long, default_value_t = 200)]
-        n_length: usize
+        n_length: usize,
     },
 
     Subset {
@@ -147,7 +146,7 @@ pub enum Commands {
 
         // Percentage of the original file entries that should be retained
         #[arg(short = 'p', long, default_value_t = 50)]
-        percent: u16
+        percent: u16,
     },
 
     FilterFasta {
@@ -160,11 +159,10 @@ pub enum Commands {
         output: String,
 
         #[arg(short = 'l', long = "filter_list")]
-        filter_list: String
+        filter_list: String,
     },
 
     Mergehaps {
-
         // The input fasta file for re-organising
         #[arg(short = 'p', long)]
         fasta_1: String,
@@ -180,5 +178,5 @@ pub enum Commands {
         // Output file prefix
         #[arg(short = 'o', default_value_t = String::from("merged"))]
         output: String,
-    }
-}
\ No newline at end of file
+    },
+}
diff --git a/src/main.rs b/src/main.rs
index 0f635ac..a450cc0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -12,52 +12,70 @@ mod generics;
 //use crate::generics::validate_fasta;
 
 // Reference: https://doc.rust-lang.org/book/ch07-02-defining-modules-to-control-scope-and-privacy.html
-use crate::processors::yaml_validator::validate_yaml;
-use crate::processors::split_by_count::split_file_by_count;
-use crate::processors::split_by_size::split_file_by_size;
+use crate::processors::exclude_seq::filter_fasta;
 use crate::processors::map_headers::map_fasta_head;
 use crate::processors::remap_head::remapping_head;
+use crate::processors::split_by_count::split_file_by_count;
+use crate::processors::split_by_size::split_file_by_size;
 use crate::processors::tpf_fasta::curate_fasta;
-use crate::processors::exclude_seq::filter_fasta;
+use crate::processors::yaml_validator::validate_yaml;
 mod processors;
 
-
 fn main() -> Result<(), Error> {
-
     let cli = Cli::parse();
 
     match &cli.command {
-        Some(Commands::YamlValidator { yaml, verbose, output }) => {
-            validate_yaml(yaml, verbose, output)
-        },
-        Some(Commands::SplitByCount { fasta_file, output_directory, data_type, sanitise, count}) => {
-            split_file_by_count(
-                fasta_file, output_directory, data_type, sanitise, count
-            )
-        },
-        Some(Commands::SplitBySize { fasta_file, mem_size, output_directory }) => {
-            split_file_by_size(
-                fasta_file,
-                mem_size,
-                output_directory
-            )
-        },
-        Some(Commands::MapHeaders { fasta_file, output_directory, replace_with }) => {
-            _ = map_fasta_head(fasta_file, output_directory, replace_with)
-        },
-        Some(Commands::ReMapHeaders { fasta_file, output_directory, map_file }) => {
-            remapping_head(fasta_file, output_directory, map_file)
+        Some(Commands::YamlValidator {
+            yaml,
+            verbose,
+            output,
+        }) => validate_yaml(yaml, verbose, output),
+        Some(Commands::SplitByCount {
+            fasta_file,
+            output_directory,
+            data_type,
+            sanitise,
+            count,
+        }) => split_file_by_count(fasta_file, output_directory, data_type, sanitise, count),
+        Some(Commands::SplitBySize {
+            fasta_file,
+            mem_size,
+            output_directory,
+        }) => split_file_by_size(fasta_file, mem_size, output_directory),
+        Some(Commands::MapHeaders {
+            fasta_file,
+            output_directory,
+            replace_with,
+        }) => _ = map_fasta_head(fasta_file, output_directory, replace_with),
+        Some(Commands::ReMapHeaders {
+            fasta_file,
+            output_directory,
+            map_file,
+        }) => remapping_head(fasta_file, output_directory, map_file),
+        Some(Commands::Curate {
+            fasta,
+            tpf,
+            sort,
+            output,
+            n_length,
+        }) => curate_fasta(fasta, tpf, sort, output, n_length),
+        Some(Commands::FilterFasta {
+            fasta,
+            output,
+            filter_list,
+        }) => filter_fasta(fasta, output, filter_list),
+        Some(Commands::GenesetCSVS { .. }) => {
+            todo!()
+        }
+        Some(Commands::Profile { .. }) => {
+            todo!()
         }
-        Some(Commands::Curate { fasta, tpf, sort, output, n_length}) => {
-            curate_fasta(fasta, tpf, sort, output, n_length)
+        Some(Commands::Subset { .. }) => {
+            todo!()
         }
-        Some(Commands::FilterFasta { fasta, output, filter_list }) => {
-            filter_fasta(fasta, output, filter_list)
+        Some(Commands::Mergehaps { .. }) => {
+            todo!()
         }
-        Some(Commands::GenesetCSVS { .. }) => { todo!() },
-        Some(Commands::Profile { .. }) => { todo!() }
-        Some(Commands::Subset { .. }) => { todo!() }
-        Some(Commands::Mergehaps { .. }) => { todo!() }
         None => {
             panic!("No command given!")
         }
diff --git a/src/processors.rs b/src/processors.rs
index 0c1ad7f..dc0f572 100644
--- a/src/processors.rs
+++ b/src/processors.rs
@@ -1,7 +1,7 @@
-pub mod yaml_validator;
-pub mod split_by_count;
-pub mod split_by_size;
+pub mod exclude_seq;
 pub mod map_headers;
 pub mod remap_head;
+pub mod split_by_count;
+pub mod split_by_size;
 pub mod tpf_fasta;
-pub mod exclude_seq;
+pub mod yaml_validator;
diff --git a/src/processors/exclude_seq.rs b/src/processors/exclude_seq.rs
index 84251a2..6e04222 100644
--- a/src/processors/exclude_seq.rs
+++ b/src/processors/exclude_seq.rs
@@ -48,4 +48,3 @@ mod tests {
         assert_eq!(result, 4);
     }
 }
-
diff --git a/src/processors/map_headers.rs b/src/processors/map_headers.rs
index 3fa1101..8cf09dd 100644
--- a/src/processors/map_headers.rs
+++ b/src/processors/map_headers.rs
@@ -44,10 +44,7 @@ pub fn create_mapping(
 
 pub fn save_mapping(
     output: &str,
-    mapped: Zip<
-        std::vec::IntoIter<std::string::String>,
-        std::vec::IntoIter<std::string::String>,
-    >,
+    mapped: Zip<std::vec::IntoIter<std::string::String>, std::vec::IntoIter<std::string::String>>,
 ) {
     let f: File = File::create(output).expect("Unable to create file");
     let mut f: BufWriter<File> = BufWriter::new(f);
@@ -62,10 +59,7 @@ pub fn save_mapping(
 pub fn create_mapped_fasta(
     input: &str,
     output: &str,
-    mapped: Zip<
-        std::vec::IntoIter<std::string::String>,
-        std::vec::IntoIter<std::string::String>,
-    >,
+    mapped: Zip<std::vec::IntoIter<std::string::String>, std::vec::IntoIter<std::string::String>>,
 ) {
     let file_reader: File = File::open(input).expect("CAN'T OPEN FILE");
     let buff_reader: BufReader<File> = BufReader::new(file_reader);
@@ -91,9 +85,10 @@ pub fn create_mapped_fasta(
 }
 
 pub fn map_fasta_head(
-    file: &String, output: &String, replacer: &String
+    file: &String,
+    output: &String,
+    replacer: &String,
 ) -> Result<(), Box<dyn Error>> {
-
     println!("Mapping headers for file: {}", file);
     println!("Replace headers with string: {:?}", &replacer);
 
@@ -139,4 +134,3 @@ mod tests {
         assert_eq!(result, 4);
     }
 }
-
diff --git a/src/processors/remap_head.rs b/src/processors/remap_head.rs
index c4f53ae..02cc633 100644
--- a/src/processors/remap_head.rs
+++ b/src/processors/remap_head.rs
@@ -38,7 +38,6 @@ pub fn pull_map_from_tsv(
 }
 
 pub fn remapping_head(file: &String, output: &String, map_file: &String) {
-
     println!("Mapping headers for file: {}", file);
     println!("Replace headers with string: {}", map_file);
 
diff --git a/src/processors/split_by_count.rs b/src/processors/split_by_count.rs
index a220b14..43bfa53 100644
--- a/src/processors/split_by_count.rs
+++ b/src/processors/split_by_count.rs
@@ -37,7 +37,13 @@ fn write_fasta(outdir: &String, fasta_record: &Vec<Record>) {
     }
 }
 
-pub fn split_file_by_count(fasta_file: &String, output_directory: &String, data_type: &String, sanitise: &bool, fasta_count: &u16) {
+pub fn split_file_by_count(
+    fasta_file: &String,
+    output_directory: &String,
+    data_type: &String,
+    sanitise: &bool,
+    fasta_count: &u16,
+) {
     let path_obj = Path::new(fasta_file);
     let grab_name = path_obj.file_name().unwrap();
     let actual_list: Vec<&str> = grab_name.to_str().unwrap().split('.').collect();
@@ -108,4 +114,3 @@ mod tests {
         assert_eq!(result, 4);
     }
 }
-
diff --git a/src/processors/split_by_size.rs b/src/processors/split_by_size.rs
index c6452dd..1bf3e27 100644
--- a/src/processors/split_by_size.rs
+++ b/src/processors/split_by_size.rs
@@ -2,10 +2,7 @@ use clap::ArgMatches;
 
 pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, output_directory: &String) {
     println!("Fasta file for processing: {:?}", &fasta_file);
-    println!(
-        "Size to chunk fasta into: {:?}",
-        mem_size
-    );
+    println!("Size to chunk fasta into: {:?}", mem_size);
 }
 
 #[cfg(test)]
diff --git a/src/processors/tpf_fasta.rs b/src/processors/tpf_fasta.rs
index 9052b6f..e8c292b 100644
--- a/src/processors/tpf_fasta.rs
+++ b/src/processors/tpf_fasta.rs
@@ -1,6 +1,6 @@
-use std::{fs::File, fs::read_to_string, str};
 use std::fs::OpenOptions;
 use std::io::Write;
+use std::{fs::read_to_string, fs::File, str};
 
 use noodles::core::Position;
 use noodles::fasta;
@@ -61,10 +61,7 @@ fn parse_tpf(path: &String) -> Vec<Tpf> {
     all_tpf
 }
 
-fn subset_vec_tpf<'a>(
-    tpf: &'a Vec<Tpf>,
-    fasta: (&std::string::String, &usize),
-) -> Vec<&'a Tpf> {
+fn subset_vec_tpf<'a>(tpf: &'a Vec<Tpf>, fasta: (&std::string::String, &usize)) -> Vec<&'a Tpf> {
     //
     // Subset the Vec<TPF> based on a search through the fasta
     //
@@ -136,12 +133,7 @@ fn get_uniques(tpf_list: &Vec<Tpf>) -> Vec<String> {
     uniques
 }
 
-fn save_to_fasta(
-    fasta_data: Vec<NewFasta>,
-    tpf_data: Vec<Tpf>,
-    output: &String,
-    n_length: usize,
-) {
+fn save_to_fasta(fasta_data: Vec<NewFasta>, tpf_data: Vec<Tpf>, output: &String, n_length: usize) {
     //
     // TPF is in the input TPF order, this will continue to be the case until
     // the script is modified and the Tpf struct gets modified in place for some reason
@@ -215,7 +207,13 @@ fn save_to_fasta(
 
 #[allow(clippy::needless_borrow)]
 #[allow(clippy::let_and_return)]
-pub fn curate_fasta(fasta_file: &String, tpf_file: &String, sort: &bool, output: &String, n_length: &usize) {
+pub fn curate_fasta(
+    fasta_file: &String,
+    tpf_file: &String,
+    sort: &bool,
+    output: &String,
+    n_length: &usize,
+) {
     //
     // Generate a curated fasta file based on the input TPF file
     // which was generated by Pretext and the agp_to_tpf script.
@@ -234,8 +232,7 @@ pub fn curate_fasta(fasta_file: &String, tpf_file: &String, sort: &bool, output:
                 // Start indexed reader of the input fasta
                 // if valid then use the data
                 //
-                let reader =
-                    fasta::indexed_reader::Builder::default().build_from_path(fasta_file);
+                let reader = fasta::indexed_reader::Builder::default().build_from_path(fasta_file);
                 let fasta_repo = match reader {
                     Ok(data) => {
                         let adapter = IndexedReader::new(data);
@@ -280,4 +277,3 @@ mod tests {
         assert_eq!(result, 4);
     }
 }
-
diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs
index a738f2e..bf3abac 100644
--- a/src/processors/yaml_validator.rs
+++ b/src/processors/yaml_validator.rs
@@ -208,12 +208,10 @@ pub fn validate_data(path: &str, dtype: &str) {
 }
 
 pub fn validate_yaml(file: &String, verbose: &bool, output: &String) {
-
     println! {"Validating Yaml: {}", file.purple()};
 
     let input = fs::File::open(file).expect("Unable to read from file");
-    let contents: TreeValYaml =
-        serde_yaml::from_reader(input).expect("Unable to read from file");
+    let contents: TreeValYaml = serde_yaml::from_reader(input).expect("Unable to read from file");
 
     println!(
         "RUNNING VALIDATE-YAML FOR SAMPLE: {}",
@@ -249,8 +247,7 @@ pub fn validate_yaml(file: &String, verbose: &bool, output: &String) {
     validate_data(&synteny_full, "synteny");
 
     println!("{}", "CHECKING BUSCO DIRECTORY RESOLVES".blue());
-    let busco_path =
-        contents.busco.lineages_path.clone() + "/lineages/" + &contents.busco.lineage;
+    let busco_path = contents.busco.lineages_path.clone() + "/lineages/" + &contents.busco.lineage;
     validate_paths(&busco_path, "BUSCO-DB");
     // NOW CHECK FOR FILES IN DIRECTORY?
 

From d8f1b33db7aff2eb60370c3e91d8255e3331dc85 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 24 May 2024 17:35:58 +0100
Subject: [PATCH 04/30] Clippy fixes

---
 src/cli/mod.rs |  2 +-
 src/main.rs    | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/cli/mod.rs b/src/cli/mod.rs
index 80dc8e5..e89e328 100644
--- a/src/cli/mod.rs
+++ b/src/cli/mod.rs
@@ -1,5 +1,5 @@
-use clap::builder::Str;
 use clap::{Parser, Subcommand};
+use clap::builder::Str;
 
 const SPLIT_OPTIONS: [&str; 5] = ["pep", "cds", "cdna", "rna", "other"];
 
diff --git a/src/main.rs b/src/main.rs
index a450cc0..bad5f7f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,15 +1,11 @@
 #![allow(non_snake_case)]
 
-use clap::{command, Arg, Command, Parser};
-use colored::Colorize;
-use std::env;
 use std::io::Error;
 
-mod cli;
-use cli::{Cli, Commands};
+use clap::Parser;
+use colored::Colorize;
 
-mod generics;
-//use crate::generics::validate_fasta;
+use cli::{Cli, Commands};
 
 // Reference: https://doc.rust-lang.org/book/ch07-02-defining-modules-to-control-scope-and-privacy.html
 use crate::processors::exclude_seq::filter_fasta;
@@ -19,6 +15,11 @@ use crate::processors::split_by_count::split_file_by_count;
 use crate::processors::split_by_size::split_file_by_size;
 use crate::processors::tpf_fasta::curate_fasta;
 use crate::processors::yaml_validator::validate_yaml;
+
+mod cli;
+mod generics;
+//use crate::generics::validate_fasta;
+
 mod processors;
 
 fn main() -> Result<(), Error> {

From ae55838106e3c51727d9edbb7ef00496393c5d44 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 24 May 2024 17:36:21 +0100
Subject: [PATCH 05/30] Ignore .idea directory in .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index ea8c4bf..2a0038a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /target
+.idea
\ No newline at end of file

From 06129eadbfd2f4da72e3a3233d505a5252a2d34a Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 24 May 2024 17:37:39 +0100
Subject: [PATCH 06/30] Clippy fixes

---
 src/cli/mod.rs                   | 1 -
 src/main.rs                      | 1 -
 src/processors/exclude_seq.rs    | 2 --
 src/processors/map_headers.rs    | 2 --
 src/processors/remap_head.rs     | 1 -
 src/processors/split_by_count.rs | 3 +--
 src/processors/split_by_size.rs  | 5 +----
 src/processors/tpf_fasta.rs      | 3 +--
 src/processors/yaml_validator.rs | 3 +--
 9 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/src/cli/mod.rs b/src/cli/mod.rs
index e89e328..2638ee9 100644
--- a/src/cli/mod.rs
+++ b/src/cli/mod.rs
@@ -1,5 +1,4 @@
 use clap::{Parser, Subcommand};
-use clap::builder::Str;
 
 const SPLIT_OPTIONS: [&str; 5] = ["pep", "cds", "cdna", "rna", "other"];
 
diff --git a/src/main.rs b/src/main.rs
index bad5f7f..d571da8 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3,7 +3,6 @@
 use std::io::Error;
 
 use clap::Parser;
-use colored::Colorize;
 
 use cli::{Cli, Commands};
 
diff --git a/src/processors/exclude_seq.rs b/src/processors/exclude_seq.rs
index 6e04222..7866df4 100644
--- a/src/processors/exclude_seq.rs
+++ b/src/processors/exclude_seq.rs
@@ -1,4 +1,3 @@
-use clap::ArgMatches;
 use noodles::fasta;
 use std::error::Error;
 use std::{fs, io::BufRead, str};
@@ -40,7 +39,6 @@ pub fn filter_fasta(fasta: &String, outfile: &String, exclude: &String) {
 
 #[cfg(test)]
 mod tests {
-    use super::*;
 
     #[test]
     fn it_works() {
diff --git a/src/processors/map_headers.rs b/src/processors/map_headers.rs
index 8cf09dd..9c3390f 100644
--- a/src/processors/map_headers.rs
+++ b/src/processors/map_headers.rs
@@ -1,4 +1,3 @@
-use clap::ArgMatches;
 use colored::Colorize;
 use std::error::Error;
 use std::fmt;
@@ -126,7 +125,6 @@ pub fn map_fasta_head(
 
 #[cfg(test)]
 mod tests {
-    use super::*;
 
     #[test]
     fn it_works() {
diff --git a/src/processors/remap_head.rs b/src/processors/remap_head.rs
index 02cc633..ca30d10 100644
--- a/src/processors/remap_head.rs
+++ b/src/processors/remap_head.rs
@@ -65,7 +65,6 @@ pub fn remapping_head(file: &String, output: &String, map_file: &String) {
 
 #[cfg(test)]
 mod tests {
-    use super::*;
 
     #[test]
     fn it_works() {
diff --git a/src/processors/split_by_count.rs b/src/processors/split_by_count.rs
index 43bfa53..35ba1fe 100644
--- a/src/processors/split_by_count.rs
+++ b/src/processors/split_by_count.rs
@@ -1,5 +1,5 @@
 use crate::generics::sanitise_header;
-use clap::ArgMatches;
+
 use compare::{natural, Compare};
 use noodles::fasta::{self, Record};
 use std::cmp::Ordering;
@@ -106,7 +106,6 @@ pub fn split_file_by_count(
 
 #[cfg(test)]
 mod tests {
-    use super::*;
 
     #[test]
     fn it_works() {
diff --git a/src/processors/split_by_size.rs b/src/processors/split_by_size.rs
index 1bf3e27..8c126dc 100644
--- a/src/processors/split_by_size.rs
+++ b/src/processors/split_by_size.rs
@@ -1,13 +1,10 @@
-use clap::ArgMatches;
-
-pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, output_directory: &String) {
+pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, _output_directory: &String) {
     println!("Fasta file for processing: {:?}", &fasta_file);
     println!("Size to chunk fasta into: {:?}", mem_size);
 }
 
 #[cfg(test)]
 mod tests {
-    use super::*;
 
     #[test]
     fn it_works() {
diff --git a/src/processors/tpf_fasta.rs b/src/processors/tpf_fasta.rs
index e8c292b..0453798 100644
--- a/src/processors/tpf_fasta.rs
+++ b/src/processors/tpf_fasta.rs
@@ -210,7 +210,7 @@ fn save_to_fasta(fasta_data: Vec<NewFasta>, tpf_data: Vec<Tpf>, output: &String,
 pub fn curate_fasta(
     fasta_file: &String,
     tpf_file: &String,
-    sort: &bool,
+    _sort: &bool,
     output: &String,
     n_length: &usize,
 ) {
@@ -269,7 +269,6 @@ pub fn curate_fasta(
 
 #[cfg(test)]
 mod tests {
-    use super::*;
 
     #[test]
     fn it_works() {
diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs
index bf3abac..0661e45 100644
--- a/src/processors/yaml_validator.rs
+++ b/src/processors/yaml_validator.rs
@@ -207,7 +207,7 @@ pub fn validate_data(path: &str, dtype: &str) {
     };
 }
 
-pub fn validate_yaml(file: &String, verbose: &bool, output: &String) {
+pub fn validate_yaml(file: &String, _verbose: &bool, _output: &String) {
     println! {"Validating Yaml: {}", file.purple()};
 
     let input = fs::File::open(file).expect("Unable to read from file");
@@ -263,7 +263,6 @@ pub fn validate_yaml(file: &String, verbose: &bool, output: &String) {
 
 #[cfg(test)]
 mod tests {
-    use super::*;
 
     #[test]
     fn it_works() {

From 011b5394033c5b309ee699816b757d5a5cd77569 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 24 May 2024 17:40:16 +0100
Subject: [PATCH 07/30] Clippy fixes

---
 src/processors/exclude_seq.rs    | 2 +-
 src/processors/split_by_size.rs  | 2 +-
 src/processors/yaml_validator.rs | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/processors/exclude_seq.rs b/src/processors/exclude_seq.rs
index 7866df4..1b0b8c1 100644
--- a/src/processors/exclude_seq.rs
+++ b/src/processors/exclude_seq.rs
@@ -32,7 +32,7 @@ fn open_fasta<'a>(
     }
 }
 
-pub fn filter_fasta(fasta: &String, outfile: &String, exclude: &String) {
+pub fn filter_fasta(fasta: &str, outfile: &str, exclude: &str) {
     let list_to_exclude = exclude.split(',').collect::<Vec<&str>>();
     let _x = open_fasta(list_to_exclude, fasta, outfile);
 }
diff --git a/src/processors/split_by_size.rs b/src/processors/split_by_size.rs
index 8c126dc..6445afd 100644
--- a/src/processors/split_by_size.rs
+++ b/src/processors/split_by_size.rs
@@ -1,4 +1,4 @@
-pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, _output_directory: &String) {
+pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, _output_directory: &str) {
     println!("Fasta file for processing: {:?}", &fasta_file);
     println!("Size to chunk fasta into: {:?}", mem_size);
 }
diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs
index 0661e45..f7a5172 100644
--- a/src/processors/yaml_validator.rs
+++ b/src/processors/yaml_validator.rs
@@ -207,7 +207,7 @@ pub fn validate_data(path: &str, dtype: &str) {
     };
 }
 
-pub fn validate_yaml(file: &String, _verbose: &bool, _output: &String) {
+pub fn validate_yaml(file: &String, _verbose: &bool, _output: &str) {
     println! {"Validating Yaml: {}", file.purple()};
 
     let input = fs::File::open(file).expect("Unable to read from file");

From c5b4ef97926460c618ae59ab4ae2fd7067f4e729 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 24 May 2024 17:43:12 +0100
Subject: [PATCH 08/30] Readme update (badge added)

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 79c7001..6c478a2 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # FastaManipulator
 
+![img](https://github.com/Rust-Wellcome/FasMan/actions/workflows/release-repo.yml/badge.svg)
+
 This is a re-write of the current fasta manipulation scripts I've written whilst at ToL, as well as adding some functionality needed for future projects.
 
 Currently, this program has the following arguments:

From bdf8cd2442a4982cc51e59cc63909ceaa20f3c07 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sat, 25 May 2024 19:29:13 +0100
Subject: [PATCH 09/30] Updating module structure.

---
 src/{processors.rs => processors/mod.rs} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/{processors.rs => processors/mod.rs} (100%)

diff --git a/src/processors.rs b/src/processors/mod.rs
similarity index 100%
rename from src/processors.rs
rename to src/processors/mod.rs

From b1423b0f8e57d13a8a6971eea8038535bc78b08d Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sat, 25 May 2024 23:31:11 +0100
Subject: [PATCH 10/30] Adding file read func

---
 src/file_utils/file_utility.rs | 67 ++++++++++++++++++++++++++++++++++
 src/file_utils/mod.rs          |  1 +
 src/main.rs                    |  1 +
 3 files changed, 69 insertions(+)
 create mode 100644 src/file_utils/file_utility.rs
 create mode 100644 src/file_utils/mod.rs

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
new file mode 100644
index 0000000..4cc5424
--- /dev/null
+++ b/src/file_utils/file_utility.rs
@@ -0,0 +1,67 @@
+use std::fs::File;
+use std::io::{BufRead, BufReader};
+
+use clap::Error;
+
+struct Records {
+    lines: Vec<String>
+}
+
+struct FileReader {
+    buffer: Vec<String> // TODO: Make use of this internal buffer.
+}
+
+pub trait Default {
+    fn default() -> Self;
+}
+
+impl Default for FileReader {
+    fn default() -> Self {
+        FileReader {
+            buffer: Vec::<String>::new()
+        }
+    }
+}
+
+impl FileReader {
+
+    /*
+     * Reads a specific number of lines from a file
+     */
+    pub fn read_file(&mut self, file_path: &str, num_lines: usize) -> Result<Records, Error> {
+        let file = File::open(file_path)?;
+        let reader = BufReader::new(file);
+        // This buffer will be stored in heap, and will popped off when read_file function goes out of scope.
+        let mut internal_buffer = vec![];   
+
+        // Error unwrapping: https://tinyurl.com/brt9fphk
+        // take() function https://tinyurl.com/6vx7m3k6
+        for line in reader.lines().take(num_lines) {
+            let result = line.expect("Error in reading file"); // This will panic if errored
+            internal_buffer.push(result);
+        };
+
+        Ok(Records { lines: internal_buffer })
+    }
+
+}
+
+#[cfg(test)]
+mod tests {
+    use core::panic;
+
+    use super::*;
+
+    #[test]
+    fn read_first_line() {
+        let mut fileReader = FileReader::default();
+        match fileReader.read_file("test_data/synthetic/tiny.fa", 3) {
+            Ok(records) => {
+                assert_eq!(3, records.lines.len())
+            }
+            Err(error) => {
+                panic!("{:?}", error)
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/file_utils/mod.rs b/src/file_utils/mod.rs
new file mode 100644
index 0000000..6f2644c
--- /dev/null
+++ b/src/file_utils/mod.rs
@@ -0,0 +1 @@
+pub mod file_utility;
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index d571da8..01e7267 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,6 +15,7 @@ use crate::processors::split_by_size::split_file_by_size;
 use crate::processors::tpf_fasta::curate_fasta;
 use crate::processors::yaml_validator::validate_yaml;
 
+mod file_utils;
 mod cli;
 mod generics;
 //use crate::generics::validate_fasta;

From 4cb033a9e69cc49a45334ee12e82315c4131bfc4 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sat, 25 May 2024 23:40:07 +0100
Subject: [PATCH 11/30] Adding start and end pointers

---
 src/file_utils/file_utility.rs | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index 4cc5424..a7062e5 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -3,14 +3,19 @@ use std::io::{BufRead, BufReader};
 
 use clap::Error;
 
+#[allow(dead_code)]
 struct Records {
     lines: Vec<String>
 }
 
+#[allow(dead_code)]
 struct FileReader {
-    buffer: Vec<String> // TODO: Make use of this internal buffer.
+    buffer: Vec<String>, // TODO: Make use of this internal buffer.
+    startPtr: u16,       // TODO: Use these pointers to read data chunks
+    endPtr: u16,
 }
 
+#[allow(dead_code)]
 pub trait Default {
     fn default() -> Self;
 }
@@ -18,7 +23,9 @@ pub trait Default {
 impl Default for FileReader {
     fn default() -> Self {
         FileReader {
-            buffer: Vec::<String>::new()
+            buffer: Vec::<String>::new(),
+            startPtr: 0,
+            endPtr: 0,
         }
     }
 }

From f8ffdba436ac05426a70515ebf7c4b6d63370401 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sat, 25 May 2024 23:42:18 +0100
Subject: [PATCH 12/30] TODO docs

---
 src/file_utils/file_utility.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index a7062e5..f5d1207 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -34,6 +34,7 @@ impl FileReader {
 
     /*
      * Reads a specific number of lines from a file
+     * TODO: Propogate errors: https://doc.rust-lang.org/book/ch09-02-recoverable-errors-with-result.html#propagating-errors 
      */
     pub fn read_file(&mut self, file_path: &str, num_lines: usize) -> Result<Records, Error> {
         let file = File::open(file_path)?;

From dcc4ed28419aff2b75f425f193b5b59677b99023 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sat, 25 May 2024 23:43:33 +0100
Subject: [PATCH 13/30] Ptr unit change

---
 src/file_utils/file_utility.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index f5d1207..9ba1627 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -11,8 +11,8 @@ struct Records {
 #[allow(dead_code)]
 struct FileReader {
     buffer: Vec<String>, // TODO: Make use of this internal buffer.
-    startPtr: u16,       // TODO: Use these pointers to read data chunks
-    endPtr: u16,
+    startPtr: usize,       // TODO: Use these pointers to read data chunks
+    endPtr: usize,
 }
 
 #[allow(dead_code)]

From ddc8414ba4a413a17ebdebd62e68ddbed40a0a04 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sun, 26 May 2024 10:58:16 +0100
Subject: [PATCH 14/30] Implementing batch-wise file-read

---
 Cargo.lock                     |  16 ++++++
 Cargo.toml                     |   1 +
 src/file_utils/file_utility.rs | 101 +++++++++++++++++++++------------
 src/file_utils/mod.rs          |   2 +-
 src/main.rs                    |   2 +-
 5 files changed, 84 insertions(+), 38 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a09d981..b1679d9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -375,6 +375,12 @@ dependencies = [
  "crypto-common",
 ]
 
+[[package]]
+name = "either"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b"
+
 [[package]]
 name = "enum-iterator"
 version = "1.5.0"
@@ -431,6 +437,7 @@ dependencies = [
  "compare",
  "csv",
  "io",
+ "itertools",
  "noodles",
  "regex",
  "serde",
@@ -710,6 +717,15 @@ dependencies = [
  "windows-sys",
 ]
 
+[[package]]
+name = "itertools"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.9"
diff --git a/Cargo.toml b/Cargo.toml
index b6f6872..2cda670 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,6 +11,7 @@ colored = "2.0.4"
 compare = "0.1.0"
 csv = "1.3.0"
 io = "0.0.2"
+itertools = "0.13.0"
 noodles = { version = "0.52.0", features = ["fasta", "cram", "csi", "core"] }
 regex = "1.9.5"
 serde = { version = "1.0.188", features = ["derive"] }
diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index 9ba1627..cacd85d 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -2,74 +2,103 @@ use std::fs::File;
 use std::io::{BufRead, BufReader};
 
 use clap::Error;
+use itertools::Itertools;
 
 #[allow(dead_code)]
-struct Records {
-    lines: Vec<String>
+struct Records<T> {
+    lines: Vec<T>,
 }
 
-#[allow(dead_code)]
-struct FileReader {
-    buffer: Vec<String>, // TODO: Make use of this internal buffer.
-    startPtr: usize,       // TODO: Use these pointers to read data chunks
-    endPtr: usize,
+impl Records<String> {
+    pub fn size(&self) -> usize {
+        self.lines.len()
+    }
 }
 
 #[allow(dead_code)]
-pub trait Default {
+struct BatchFileReader {}
+
+#[allow(dead_code)]
+pub trait DefaultReader {
     fn default() -> Self;
 }
 
-impl Default for FileReader {
+impl DefaultReader for BatchFileReader {
     fn default() -> Self {
-        FileReader {
-            buffer: Vec::<String>::new(),
-            startPtr: 0,
-            endPtr: 0,
-        }
+        BatchFileReader {}
     }
 }
 
-impl FileReader {
-
+impl BatchFileReader {
     /*
-     * Reads a specific number of lines from a file
-     * TODO: Propogate errors: https://doc.rust-lang.org/book/ch09-02-recoverable-errors-with-result.html#propagating-errors 
+     * Reads a specific number of lines from a file from the top
      */
-    pub fn read_file(&mut self, file_path: &str, num_lines: usize) -> Result<Records, Error> {
+    pub fn read_lines(
+        &mut self,
+        file_path: &str,
+        num_lines: usize,
+    ) -> Result<Records<String>, Error> {
         let file = File::open(file_path)?;
         let reader = BufReader::new(file);
-        // This buffer will be stored in heap, and will popped off when read_file function goes out of scope.
-        let mut internal_buffer = vec![];   
+        let mut internal_buffer = Vec::<String>::new();
 
         // Error unwrapping: https://tinyurl.com/brt9fphk
         // take() function https://tinyurl.com/6vx7m3k6
         for line in reader.lines().take(num_lines) {
             let result = line.expect("Error in reading file"); // This will panic if errored
-            internal_buffer.push(result);
-        };
+            internal_buffer.push(result.clone())
+        }
 
-        Ok(Records { lines: internal_buffer })
+        Ok(Records {
+            lines: internal_buffer,
+        })
     }
 
+    /**
+     * Reads a file batch by batch, and applies a function Fn for each chunk
+     */
+    pub fn read_file_by_batch(
+        &mut self,
+        file_path: &str,
+        batch_size: usize,
+        f: &dyn Fn(Records<String>) -> (),
+    ) -> Result<(), Error> {
+        let file = File::open(file_path)?;
+        let reader = BufReader::new(file);
+
+        for chunk in &reader.lines().map_while(Result::ok).chunks(batch_size) {
+            f(Records {
+                lines: chunk.collect(),
+            });
+        }
+
+        Ok(())
+    }
 }
 
 #[cfg(test)]
 mod tests {
-    use core::panic;
 
     use super::*;
 
+    const TEST_FILE_PATH: &str = "test_data/synthetic/tiny.fa";
+
     #[test]
-    fn read_first_line() {
-        let mut fileReader = FileReader::default();
-        match fileReader.read_file("test_data/synthetic/tiny.fa", 3) {
-            Ok(records) => {
-                assert_eq!(3, records.lines.len())
-            }
-            Err(error) => {
-                panic!("{:?}", error)
-            }
-        }
+    fn read_lines() {
+        let mut BatchFileReader = BatchFileReader::default();
+        let records = BatchFileReader.read_lines(TEST_FILE_PATH, 3).unwrap();
+        assert_eq!(3, records.lines.len());
     }
-}
\ No newline at end of file
+
+    fn print_function(input: Records<String>) -> () {
+        assert_eq!(true, input.size() <= 3);
+    }
+
+    #[test]
+    fn read_file_batch() {
+        let mut BatchFileReader = BatchFileReader::default();
+        BatchFileReader
+            .read_file_by_batch(TEST_FILE_PATH, 3, &print_function)
+            .unwrap();
+    }
+}
diff --git a/src/file_utils/mod.rs b/src/file_utils/mod.rs
index 6f2644c..79b7217 100644
--- a/src/file_utils/mod.rs
+++ b/src/file_utils/mod.rs
@@ -1 +1 @@
-pub mod file_utility;
\ No newline at end of file
+pub mod file_utility;
diff --git a/src/main.rs b/src/main.rs
index 01e7267..1568b0b 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,8 +15,8 @@ use crate::processors::split_by_size::split_file_by_size;
 use crate::processors::tpf_fasta::curate_fasta;
 use crate::processors::yaml_validator::validate_yaml;
 
-mod file_utils;
 mod cli;
+mod file_utils;
 mod generics;
 //use crate::generics::validate_fasta;
 

From 0904ef1e4ab8ed99b29fea958fe81d5d196425a0 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sun, 26 May 2024 11:05:29 +0100
Subject: [PATCH 15/30] Adding documentation

---
 src/file_utils/file_utility.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index cacd85d..8a0d1d3 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -56,6 +56,8 @@ impl BatchFileReader {
 
     /**
      * Reads a file batch by batch, and applies a function Fn for each chunk
+     * Function pointers documentation: https://doc.rust-lang.org/book/ch19-05-advanced-functions-and-closures.html#function-pointers
+     * f is a closure pushed into the stack of read_file_by_batch that is similar to an anonymous function in Java/JavaScript/C#
      */
     pub fn read_file_by_batch(
         &mut self,
@@ -66,6 +68,8 @@ impl BatchFileReader {
         let file = File::open(file_path)?;
         let reader = BufReader::new(file);
 
+        // map_while() Creates an iterator that both yields elements based on a predicate and maps.
+        // https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map_while
         for chunk in &reader.lines().map_while(Result::ok).chunks(batch_size) {
             f(Records {
                 lines: chunk.collect(),
@@ -90,6 +94,8 @@ mod tests {
         assert_eq!(3, records.lines.len());
     }
 
+    // You can create the closure in one place and then call the closure elsewhere to evaluate it in a different context.
+    // Reference: https://doc.rust-lang.org/book/ch13-01-closures.html
     fn print_function(input: Records<String>) -> () {
         assert_eq!(true, input.size() <= 3);
     }

From 28d05ca4cd789cce47989101e33429ccbad7cf33 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sun, 26 May 2024 11:12:25 +0100
Subject: [PATCH 16/30] Added new recordline type

---
 src/file_utils/file_utility.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index 8a0d1d3..bd2b0d8 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -4,6 +4,8 @@ use std::io::{BufRead, BufReader};
 use clap::Error;
 use itertools::Itertools;
 
+struct RecordLine(String);
+
 #[allow(dead_code)]
 struct Records<T> {
     lines: Vec<T>,

From 37d3bdfcefa9cec2bb74aee61b7adad36e99cdd9 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sun, 26 May 2024 11:24:28 +0100
Subject: [PATCH 17/30] Clippy fixes

---
 src/file_utils/file_utility.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index bd2b0d8..f8c4ddd 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -65,7 +65,7 @@ impl BatchFileReader {
         &mut self,
         file_path: &str,
         batch_size: usize,
-        f: &dyn Fn(Records<String>) -> (),
+        f: &dyn Fn(Records<String>),
     ) -> Result<(), Error> {
         let file = File::open(file_path)?;
         let reader = BufReader::new(file);
@@ -98,8 +98,8 @@ mod tests {
 
     // You can create the closure in one place and then call the closure elsewhere to evaluate it in a different context.
     // Reference: https://doc.rust-lang.org/book/ch13-01-closures.html
-    fn print_function(input: Records<String>) -> () {
-        assert_eq!(true, input.size() <= 3);
+    fn print_function(input: Records<String>) {
+        assert!(input.size() <= 3);
     }
 
     #[test]

From 0583492591d66b262e702b2b9754af6cc87f3d74 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sun, 26 May 2024 11:28:19 +0100
Subject: [PATCH 18/30] Allowing dead code (until they are used in upstream lib
 calls)

---
 src/file_utils/file_utility.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index f8c4ddd..e12e74c 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -4,6 +4,7 @@ use std::io::{BufRead, BufReader};
 use clap::Error;
 use itertools::Itertools;
 
+#[allow(dead_code)] 
 struct RecordLine(String);
 
 #[allow(dead_code)]
@@ -11,6 +12,7 @@ struct Records<T> {
     lines: Vec<T>,
 }
 
+#[allow(dead_code)] 
 impl Records<String> {
     pub fn size(&self) -> usize {
         self.lines.len()
@@ -31,6 +33,7 @@ impl DefaultReader for BatchFileReader {
     }
 }
 
+#[allow(dead_code)] 
 impl BatchFileReader {
     /*
      * Reads a specific number of lines from a file from the top

From b6bcb9c8688a3985cd8d2308629c733110faf1a5 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sun, 26 May 2024 11:30:06 +0100
Subject: [PATCH 19/30] Cargo fmt run

---
 src/file_utils/file_utility.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index e12e74c..44fe0c9 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -4,7 +4,7 @@ use std::io::{BufRead, BufReader};
 use clap::Error;
 use itertools::Itertools;
 
-#[allow(dead_code)] 
+#[allow(dead_code)]
 struct RecordLine(String);
 
 #[allow(dead_code)]
@@ -12,7 +12,7 @@ struct Records<T> {
     lines: Vec<T>,
 }
 
-#[allow(dead_code)] 
+#[allow(dead_code)]
 impl Records<String> {
     pub fn size(&self) -> usize {
         self.lines.len()
@@ -33,7 +33,7 @@ impl DefaultReader for BatchFileReader {
     }
 }
 
-#[allow(dead_code)] 
+#[allow(dead_code)]
 impl BatchFileReader {
     /*
      * Reads a specific number of lines from a file from the top

From 034abee3c1584e5459498ad39f1f10681b6c7a30 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sun, 26 May 2024 20:22:57 +0100
Subject: [PATCH 20/30] Adding some error handling logic and logging

---
 Cargo.lock                     |  7 ++++++
 Cargo.toml                     |  1 +
 src/errors/file_error.rs       | 26 ++++++++++++++++++++++
 src/errors/mod.rs              |  1 +
 src/file_utils/file_utility.rs | 40 ++++++++++++++++++++++++++--------
 src/main.rs                    |  1 +
 6 files changed, 67 insertions(+), 9 deletions(-)
 create mode 100644 src/errors/file_error.rs
 create mode 100644 src/errors/mod.rs

diff --git a/Cargo.lock b/Cargo.lock
index b1679d9..6868692 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -438,6 +438,7 @@ dependencies = [
  "csv",
  "io",
  "itertools",
+ "log",
  "noodles",
  "regex",
  "serde",
@@ -823,6 +824,12 @@ version = "0.4.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3852614a3bd9ca9804678ba6be5e3b8ce76dfc902cae004e3e0c44051b6e88db"
 
+[[package]]
+name = "log"
+version = "0.4.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
+
 [[package]]
 name = "lzma-sys"
 version = "0.1.20"
diff --git a/Cargo.toml b/Cargo.toml
index 2cda670..2f4477f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,3 +17,4 @@ regex = "1.9.5"
 serde = { version = "1.0.188", features = ["derive"] }
 serde_yaml = "0.9.25"
 stacker = "0.1.15"
+log = "0.4.21"
diff --git a/src/errors/file_error.rs b/src/errors/file_error.rs
new file mode 100644
index 0000000..5e978da
--- /dev/null
+++ b/src/errors/file_error.rs
@@ -0,0 +1,26 @@
+use std::fmt::{self};
+
+use std::io::Error;
+
+// Define our error types. These may be customized for our error handling cases.
+// Now we will be able to write our own errors, defer to an underlying error
+// implementation, or do something in between.
+// Resource: https://doc.rust-lang.org/rust-by-example/error/multiple_error_types/define_error_type.html
+#[derive(Debug, Clone)]
+pub struct FileError {
+    message: String,
+}
+
+impl fmt::Display for FileError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "Error in handling the file.")
+    }
+}
+
+impl From<Error> for FileError {
+    fn from(error: Error) -> Self {
+        FileError {
+            message: format!("{}", error),
+        }
+    }
+}
diff --git a/src/errors/mod.rs b/src/errors/mod.rs
new file mode 100644
index 0000000..6bf812b
--- /dev/null
+++ b/src/errors/mod.rs
@@ -0,0 +1 @@
+pub mod file_error;
diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index 44fe0c9..4c544dd 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -1,7 +1,8 @@
+use log::{info};
 use std::fs::File;
 use std::io::{BufRead, BufReader};
 
-use clap::Error;
+use crate::errors::file_error::FileError;
 use itertools::Itertools;
 
 #[allow(dead_code)]
@@ -42,9 +43,19 @@ impl BatchFileReader {
         &mut self,
         file_path: &str,
         num_lines: usize,
-    ) -> Result<Records<String>, Error> {
-        let file = File::open(file_path)?;
-        let reader = BufReader::new(file);
+    ) -> Result<Records<String>, FileError> {
+        info!("Reading lines in file.");
+        let file = File::open(file_path);
+
+        let result = match file {
+            Ok(file) => file,
+            Err(error) => {
+                info!("Error in file handler: {:?}", error);
+                return Err(error.into());
+            }
+        };
+
+        let reader = BufReader::new(result);
         let mut internal_buffer = Vec::<String>::new();
 
         // Error unwrapping: https://tinyurl.com/brt9fphk
@@ -69,9 +80,20 @@ impl BatchFileReader {
         file_path: &str,
         batch_size: usize,
         f: &dyn Fn(Records<String>),
-    ) -> Result<(), Error> {
-        let file = File::open(file_path)?;
-        let reader = BufReader::new(file);
+    ) -> Result<(), FileError> {
+        info!("Reading file by chunk.");
+
+        let file = File::open(file_path);
+
+        let result = match file {
+            Ok(file) => file,
+            Err(error) => {
+                info!("Error in file handler: {:?}", error);
+                return Err(error.into());
+            }
+        };
+
+        let reader = BufReader::new(result);
 
         // map_while() Creates an iterator that both yields elements based on a predicate and maps.
         // https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map_while
@@ -101,7 +123,7 @@ mod tests {
 
     // You can create the closure in one place and then call the closure elsewhere to evaluate it in a different context.
     // Reference: https://doc.rust-lang.org/book/ch13-01-closures.html
-    fn print_function(input: Records<String>) {
+    fn assert_function(input: Records<String>) {
         assert!(input.size() <= 3);
     }
 
@@ -109,7 +131,7 @@ mod tests {
     fn read_file_batch() {
         let mut BatchFileReader = BatchFileReader::default();
         BatchFileReader
-            .read_file_by_batch(TEST_FILE_PATH, 3, &print_function)
+            .read_file_by_batch(TEST_FILE_PATH, 3, &assert_function)
             .unwrap();
     }
 }
diff --git a/src/main.rs b/src/main.rs
index 1568b0b..d66d003 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -16,6 +16,7 @@ use crate::processors::tpf_fasta::curate_fasta;
 use crate::processors::yaml_validator::validate_yaml;
 
 mod cli;
+mod errors;
 mod file_utils;
 mod generics;
 //use crate::generics::validate_fasta;

From c4ed2de31c16f9f93607581be5e0cba815408330 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sun, 26 May 2024 20:24:45 +0100
Subject: [PATCH 21/30] Fmt fixes

---
 src/file_utils/file_utility.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index 4c544dd..b87000d 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -1,4 +1,4 @@
-use log::{info};
+use log::info;
 use std::fs::File;
 use std::io::{BufRead, BufReader};
 

From e436eff43388335ab8a81c7d55a30ffa6c0d8dab Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Sun, 26 May 2024 20:25:52 +0100
Subject: [PATCH 22/30] Allowing dead code until it's used.

---
 src/errors/file_error.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/errors/file_error.rs b/src/errors/file_error.rs
index 5e978da..77ec0f8 100644
--- a/src/errors/file_error.rs
+++ b/src/errors/file_error.rs
@@ -7,6 +7,7 @@ use std::io::Error;
 // implementation, or do something in between.
 // Resource: https://doc.rust-lang.org/rust-by-example/error/multiple_error_types/define_error_type.html
 #[derive(Debug, Clone)]
+#[allow(dead_code)]
 pub struct FileError {
     message: String,
 }

From 7de11f2a031f995d771744bd9c9bb435f1c2e7a8 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Mon, 27 May 2024 08:30:25 +0100
Subject: [PATCH 23/30] Refactoring items

---
 src/file_utils/file_utility.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index b87000d..8a58c01 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -10,13 +10,13 @@ struct RecordLine(String);
 
 #[allow(dead_code)]
 struct Records<T> {
-    lines: Vec<T>,
+    items: Vec<T>,
 }
 
 #[allow(dead_code)]
 impl Records<String> {
     pub fn size(&self) -> usize {
-        self.lines.len()
+        self.items.len()
     }
 }
 
@@ -66,7 +66,7 @@ impl BatchFileReader {
         }
 
         Ok(Records {
-            lines: internal_buffer,
+            items: internal_buffer,
         })
     }
 
@@ -99,7 +99,7 @@ impl BatchFileReader {
         // https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map_while
         for chunk in &reader.lines().map_while(Result::ok).chunks(batch_size) {
             f(Records {
-                lines: chunk.collect(),
+                items: chunk.collect(),
             });
         }
 
@@ -118,7 +118,7 @@ mod tests {
     fn read_lines() {
         let mut BatchFileReader = BatchFileReader::default();
         let records = BatchFileReader.read_lines(TEST_FILE_PATH, 3).unwrap();
-        assert_eq!(3, records.lines.len());
+        assert_eq!(3, records.items.len());
     }
 
     // You can create the closure in one place and then call the closure elsewhere to evaluate it in a different context.

From ec7c7363423420a1a39f031a5cc9aa2b5aadbccc Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 31 May 2024 14:57:49 +0100
Subject: [PATCH 24/30] Removing dead code.

---
 src/file_utils/file_utility.rs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index 8a58c01..1e0b69e 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -5,9 +5,6 @@ use std::io::{BufRead, BufReader};
 use crate::errors::file_error::FileError;
 use itertools::Itertools;
 
-#[allow(dead_code)]
-struct RecordLine(String);
-
 #[allow(dead_code)]
 struct Records<T> {
     items: Vec<T>,

From 6e078ec1787edfdede3ceeec653aa805a0584563 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 31 May 2024 15:32:15 +0100
Subject: [PATCH 25/30] Added lib crate for modularity

---
 src/file_utils/file_utility.rs |  4 +-
 src/lib.rs                     | 83 +++++++++++++++++++++++++++++++++
 src/main.rs                    | 85 +++-------------------------------
 3 files changed, 93 insertions(+), 79 deletions(-)
 create mode 100644 src/lib.rs

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index 1e0b69e..6ccb07f 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -71,6 +71,8 @@ impl BatchFileReader {
      * Reads a file batch by batch, and applies a function Fn for each chunk
      * Function pointers documentation: https://doc.rust-lang.org/book/ch19-05-advanced-functions-and-closures.html#function-pointers
      * f is a closure pushed into the stack of read_file_by_batch that is similar to an anonymous function in Java/JavaScript/C#
+     * https://doc.rust-lang.org/book/ch13-01-closures.html#moving-captured-values-out-of-closures-and-the-fn-traits
+     * Note that f is not intended to mutate the captured Records value, and should not return anything (i.e., move the captured Record value out of the closure).
      */
     pub fn read_file_by_batch(
         &mut self,
@@ -129,6 +131,6 @@ mod tests {
         let mut BatchFileReader = BatchFileReader::default();
         BatchFileReader
             .read_file_by_batch(TEST_FILE_PATH, 3, &assert_function)
-            .unwrap();
+            .unwrap_or_else(|e| panic!("Error: {:?}", e));
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..6fe7e03
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,83 @@
+use clap::Parser;
+
+use cli::{Cli, Commands};
+use std::io::Error;
+
+// Reference: https://doc.rust-lang.org/book/ch07-02-defining-modules-to-control-scope-and-privacy.html
+use crate::processors::exclude_seq::filter_fasta;
+use crate::processors::map_headers::map_fasta_head;
+use crate::processors::remap_head::remapping_head;
+use crate::processors::split_by_count::split_file_by_count;
+use crate::processors::split_by_size::split_file_by_size;
+use crate::processors::tpf_fasta::curate_fasta;
+use crate::processors::yaml_validator::validate_yaml;
+
+mod cli;
+mod errors;
+mod file_utils;
+mod generics;
+//use crate::generics::validate_fasta;
+
+mod processors;
+
+pub fn run() -> Result<(), Error> {
+    let cli = Cli::parse();
+
+    match &cli.command {
+        Some(Commands::YamlValidator {
+            yaml,
+            verbose,
+            output,
+        }) => validate_yaml(yaml, verbose, output),
+        Some(Commands::SplitByCount {
+            fasta_file,
+            output_directory,
+            data_type,
+            sanitise,
+            count,
+        }) => split_file_by_count(fasta_file, output_directory, data_type, sanitise, count),
+        Some(Commands::SplitBySize {
+            fasta_file,
+            mem_size,
+            output_directory,
+        }) => split_file_by_size(fasta_file, mem_size, output_directory),
+        Some(Commands::MapHeaders {
+            fasta_file,
+            output_directory,
+            replace_with,
+        }) => _ = map_fasta_head(fasta_file, output_directory, replace_with),
+        Some(Commands::ReMapHeaders {
+            fasta_file,
+            output_directory,
+            map_file,
+        }) => remapping_head(fasta_file, output_directory, map_file),
+        Some(Commands::Curate {
+            fasta,
+            tpf,
+            sort,
+            output,
+            n_length,
+        }) => curate_fasta(fasta, tpf, sort, output, n_length),
+        Some(Commands::FilterFasta {
+            fasta,
+            output,
+            filter_list,
+        }) => filter_fasta(fasta, output, filter_list),
+        Some(Commands::GenesetCSVS { .. }) => {
+            todo!()
+        }
+        Some(Commands::Profile { .. }) => {
+            todo!()
+        }
+        Some(Commands::Subset { .. }) => {
+            todo!()
+        }
+        Some(Commands::Mergehaps { .. }) => {
+            todo!()
+        }
+        None => {
+            panic!("No command given!")
+        }
+    }
+    Ok(())
+}
diff --git a/src/main.rs b/src/main.rs
index d66d003..8bf8d2f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,85 +2,14 @@
 
 use std::io::Error;
 
-use clap::Parser;
-
-use cli::{Cli, Commands};
-
-// Reference: https://doc.rust-lang.org/book/ch07-02-defining-modules-to-control-scope-and-privacy.html
-use crate::processors::exclude_seq::filter_fasta;
-use crate::processors::map_headers::map_fasta_head;
-use crate::processors::remap_head::remapping_head;
-use crate::processors::split_by_count::split_file_by_count;
-use crate::processors::split_by_size::split_file_by_size;
-use crate::processors::tpf_fasta::curate_fasta;
-use crate::processors::yaml_validator::validate_yaml;
-
-mod cli;
-mod errors;
-mod file_utils;
-mod generics;
-//use crate::generics::validate_fasta;
-
-mod processors;
+use fasta_manipulation::run;
 
+// https://doc.rust-lang.org/book/ch12-03-improving-error-handling-and-modularity.html#separation-of-concerns-for-binary-projects
 fn main() -> Result<(), Error> {
-    let cli = Cli::parse();
-
-    match &cli.command {
-        Some(Commands::YamlValidator {
-            yaml,
-            verbose,
-            output,
-        }) => validate_yaml(yaml, verbose, output),
-        Some(Commands::SplitByCount {
-            fasta_file,
-            output_directory,
-            data_type,
-            sanitise,
-            count,
-        }) => split_file_by_count(fasta_file, output_directory, data_type, sanitise, count),
-        Some(Commands::SplitBySize {
-            fasta_file,
-            mem_size,
-            output_directory,
-        }) => split_file_by_size(fasta_file, mem_size, output_directory),
-        Some(Commands::MapHeaders {
-            fasta_file,
-            output_directory,
-            replace_with,
-        }) => _ = map_fasta_head(fasta_file, output_directory, replace_with),
-        Some(Commands::ReMapHeaders {
-            fasta_file,
-            output_directory,
-            map_file,
-        }) => remapping_head(fasta_file, output_directory, map_file),
-        Some(Commands::Curate {
-            fasta,
-            tpf,
-            sort,
-            output,
-            n_length,
-        }) => curate_fasta(fasta, tpf, sort, output, n_length),
-        Some(Commands::FilterFasta {
-            fasta,
-            output,
-            filter_list,
-        }) => filter_fasta(fasta, output, filter_list),
-        Some(Commands::GenesetCSVS { .. }) => {
-            todo!()
-        }
-        Some(Commands::Profile { .. }) => {
-            todo!()
-        }
-        Some(Commands::Subset { .. }) => {
-            todo!()
-        }
-        Some(Commands::Mergehaps { .. }) => {
-            todo!()
-        }
-        None => {
-            panic!("No command given!")
-        }
+    if let Err(e) = run() {
+        eprintln!("Error: {}", e);
+        std::process::exit(1);
+    } else {
+        Ok(())
     }
-    Ok(())
 }

From 9d263da75527df5e27c5edd04cfda4f028905c8e Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 31 May 2024 15:35:18 +0100
Subject: [PATCH 26/30] Updated clippy fixes

---
 src/file_utils/file_utility.rs   |  8 ++++----
 src/processors/yaml_validator.rs | 10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs
index 6ccb07f..8a30eaa 100644
--- a/src/file_utils/file_utility.rs
+++ b/src/file_utils/file_utility.rs
@@ -115,8 +115,8 @@ mod tests {
 
     #[test]
     fn read_lines() {
-        let mut BatchFileReader = BatchFileReader::default();
-        let records = BatchFileReader.read_lines(TEST_FILE_PATH, 3).unwrap();
+        let mut batch_file_reader = BatchFileReader::default();
+        let records = batch_file_reader.read_lines(TEST_FILE_PATH, 3).unwrap();
         assert_eq!(3, records.items.len());
     }
 
@@ -128,8 +128,8 @@ mod tests {
 
     #[test]
     fn read_file_batch() {
-        let mut BatchFileReader = BatchFileReader::default();
-        BatchFileReader
+        let mut batch_file_reader = BatchFileReader::default();
+        batch_file_reader
             .read_file_by_batch(TEST_FILE_PATH, 3, &assert_function)
             .unwrap_or_else(|e| panic!("Error: {:?}", e));
     }
diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs
index f7a5172..a0a595c 100644
--- a/src/processors/yaml_validator.rs
+++ b/src/processors/yaml_validator.rs
@@ -29,9 +29,9 @@ struct Assembly {
     level: String,
     sample_id: String,
     latin_name: String,
-    classT: String,
-    asmVersion: u16,
-    gevalType: String,
+    class_t: String,
+    asm_version: u16,
+    geval_type: String,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -233,7 +233,7 @@ pub fn validate_yaml(file: &String, _verbose: &bool, _output: &str) {
     let genesets = contents.alignment.geneset.split(',');
     for set in genesets {
         let gene_alignment_path = contents.alignment.data_dir.clone()
-            + &contents.assembly.classT
+            + &contents.assembly.class_t
             + "/csv_data/"
             + set
             + "-data.csv";
@@ -242,7 +242,7 @@ pub fn validate_yaml(file: &String, _verbose: &bool, _output: &str) {
 
     println!("{}", "CHECKING SYNTENY DIRECTORY RESOLVES".blue());
     let synteny_full =
-        contents.synteny.synteny_genome_path.clone() + &contents.assembly.classT + "/";
+        contents.synteny.synteny_genome_path.clone() + &contents.assembly.class_t + "/";
     validate_paths(&synteny_full, "SYNTENY-FASTA");
     validate_data(&synteny_full, "synteny");
 

From b0d7592bf23f00f6f96519263518a1a6ea4820ed Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 31 May 2024 16:03:13 +0100
Subject: [PATCH 27/30] Refactoring main function

---
 src/main.rs | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 8bf8d2f..3282090 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,15 +1,13 @@
 #![allow(non_snake_case)]
 
-use std::io::Error;
-
 use fasta_manipulation::run;
 
 // https://doc.rust-lang.org/book/ch12-03-improving-error-handling-and-modularity.html#separation-of-concerns-for-binary-projects
-fn main() -> Result<(), Error> {
+fn main() {
     if let Err(e) = run() {
         eprintln!("Error: {}", e);
         std::process::exit(1);
     } else {
-        Ok(())
+        println!("Done!");
     }
 }

From f0b5fbd8dddaeb5e637c284178359f12e2973317 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Fri, 31 May 2024 16:14:49 +0100
Subject: [PATCH 28/30] Refactoring lib

---
 src/lib.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 6fe7e03..36ac37b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -16,7 +16,6 @@ mod cli;
 mod errors;
 mod file_utils;
 mod generics;
-//use crate::generics::validate_fasta;
 
 mod processors;
 
@@ -76,7 +75,7 @@ pub fn run() -> Result<(), Error> {
             todo!()
         }
         None => {
-            panic!("No command given!")
+            println!("No command provided");
         }
     }
     Ok(())

From 9b5caadb668e039b33bd7dd0f0778b3522bfbc93 Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Tue, 4 Jun 2024 18:19:16 +0100
Subject: [PATCH 29/30] Setup human-panic

---
 Cargo.lock  | 289 +++++++++++++++++++++++++++++++++++++++++++++++-----
 Cargo.toml  |   1 +
 src/main.rs |   3 +
 3 files changed, 266 insertions(+), 27 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 6868692..5baf7b5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,15 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "addr2line"
+version = "0.22.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678"
+dependencies = [
+ "gimli",
+]
+
 [[package]]
 name = "adler"
 version = "1.0.2"
@@ -44,8 +53,23 @@ dependencies = [
  "anstyle",
  "anstyle-parse",
  "anstyle-query",
- "anstyle-wincon",
+ "anstyle-wincon 2.1.0",
+ "colorchoice",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstream"
+version = "0.6.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon 3.0.3",
  "colorchoice",
+ "is_terminal_polyfill",
  "utf8parse",
 ]
 
@@ -70,7 +94,7 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b"
 dependencies = [
- "windows-sys",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -80,7 +104,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "58f54d10c6dfa51283a066ceab3ec1ab78d13fae00aa49243a45e4571fb79dfd"
 dependencies = [
  "anstyle",
- "windows-sys",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
+dependencies = [
+ "anstyle",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -98,6 +132,21 @@ version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
 
+[[package]]
+name = "backtrace"
+version = "0.3.72"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "17c6a35df3749d2e8bb1b7b21a976d82b15548788d2735b9d82f329268f71a11"
+dependencies = [
+ "addr2line",
+ "cc",
+ "cfg-if",
+ "libc",
+ "miniz_oxide",
+ "object",
+ "rustc-demangle",
+]
+
 [[package]]
 name = "bit-vec"
 version = "0.6.3"
@@ -179,12 +228,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.0.83"
+version = "1.0.98"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0"
-dependencies = [
- "libc",
-]
+checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f"
 
 [[package]]
 name = "cfg-if"
@@ -208,7 +254,7 @@ version = "4.4.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5179bb514e4d7c2051749d8fcefa2ed6d06a9f4e6d69faf3805f5d80b8cf8d56"
 dependencies = [
- "anstream",
+ "anstream 0.5.0",
  "anstyle",
  "clap_lex",
  "strsim",
@@ -246,7 +292,7 @@ checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6"
 dependencies = [
  "is-terminal",
  "lazy_static",
- "windows-sys",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -415,7 +461,7 @@ checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd"
 dependencies = [
  "errno-dragonfly",
  "libc",
- "windows-sys",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -436,6 +482,7 @@ dependencies = [
  "colored",
  "compare",
  "csv",
+ "human-panic",
  "io",
  "itertools",
  "log",
@@ -587,6 +634,23 @@ dependencies = [
  "version_check",
 ]
 
+[[package]]
+name = "getrandom"
+version = "0.2.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
+[[package]]
+name = "gimli"
+version = "0.29.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"
+
 [[package]]
 name = "gmeta"
 version = "1.3.0"
@@ -665,6 +729,22 @@ version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
 
+[[package]]
+name = "human-panic"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4c5d0e9120f6bca6120d142c7ede1ba376dd6bf276d69dd3dbe6cbeb7824179"
+dependencies = [
+ "anstream 0.6.14",
+ "anstyle",
+ "backtrace",
+ "os_info",
+ "serde",
+ "serde_derive",
+ "toml",
+ "uuid",
+]
+
 [[package]]
 name = "impl-codec"
 version = "0.6.0"
@@ -715,9 +795,15 @@ checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
 dependencies = [
  "hermit-abi",
  "rustix",
- "windows-sys",
+ "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
+
 [[package]]
 name = "itertools"
 version = "0.13.0"
@@ -976,12 +1062,32 @@ dependencies = [
  "noodles-csi",
 ]
 
+[[package]]
+name = "object"
+version = "0.35.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8ec7ab813848ba4522158d5517a6093db1ded27575b070f4177b8d12b41db5e"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "once_cell"
 version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
+[[package]]
+name = "os_info"
+version = "3.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae99c7fa6dd38c7cafe1ec085e804f8f555a2f8659b0dbe03f1f9963a9b51092"
+dependencies = [
+ "log",
+ "serde",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "page_size"
 version = "0.6.0"
@@ -1010,7 +1116,7 @@ version = "3.6.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "be30eaf4b0a9fba5336683b38de57bb86d179a35862ba6bfcf57625d006bde5b"
 dependencies = [
- "proc-macro-crate 2.0.2",
+ "proc-macro-crate 2.0.0",
  "proc-macro2",
  "quote",
  "syn 1.0.109",
@@ -1058,11 +1164,10 @@ dependencies = [
 
 [[package]]
 name = "proc-macro-crate"
-version = "2.0.2"
+version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b00f26d3400549137f92511a46ac1cd8ce37cb5598a96d382381458b992a5d24"
+checksum = "7e8366a6159044a37876a2b9817124296703c586a5c92e2c53751fa06d8d43e8"
 dependencies = [
- "toml_datetime",
  "toml_edit 0.20.2",
 ]
 
@@ -1122,6 +1227,12 @@ version = "0.7.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
 
+[[package]]
+name = "rustc-demangle"
+version = "0.1.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
+
 [[package]]
 name = "rustc_version"
 version = "0.4.0"
@@ -1141,7 +1252,7 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys",
- "windows-sys",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -1200,6 +1311,15 @@ dependencies = [
  "syn 2.0.60",
 ]
 
+[[package]]
+name = "serde_spanned"
+version = "0.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "serde_yaml"
 version = "0.9.25"
@@ -1281,11 +1401,26 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
+[[package]]
+name = "toml"
+version = "0.8.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335"
+dependencies = [
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_edit 0.22.14",
+]
+
 [[package]]
 name = "toml_datetime"
-version = "0.6.3"
+version = "0.6.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b"
+checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf"
+dependencies = [
+ "serde",
+]
 
 [[package]]
 name = "toml_edit"
@@ -1309,6 +1444,18 @@ dependencies = [
  "winnow",
 ]
 
+[[package]]
+name = "toml_edit"
+version = "0.22.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
+dependencies = [
+ "indexmap",
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+]
+
 [[package]]
 name = "typenum"
 version = "1.17.0"
@@ -1351,12 +1498,27 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
 
+[[package]]
+name = "uuid"
+version = "1.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a183cf7feeba97b4dd1c0d46788634f6221d87fa961b305bed08c851829efcc0"
+dependencies = [
+ "getrandom",
+]
+
 [[package]]
 name = "version_check"
 version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
 
+[[package]]
+name = "wasi"
+version = "0.11.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
+
 [[package]]
 name = "winapi"
 version = "0.3.9"
@@ -1385,7 +1547,16 @@ version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
 dependencies = [
- "windows-targets",
+ "windows-targets 0.48.5",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.5",
 ]
 
 [[package]]
@@ -1394,13 +1565,29 @@ version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
 dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
+ "windows_aarch64_gnullvm 0.48.5",
+ "windows_aarch64_msvc 0.48.5",
+ "windows_i686_gnu 0.48.5",
+ "windows_i686_msvc 0.48.5",
+ "windows_x86_64_gnu 0.48.5",
+ "windows_x86_64_gnullvm 0.48.5",
+ "windows_x86_64_msvc 0.48.5",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.5",
+ "windows_aarch64_msvc 0.52.5",
+ "windows_i686_gnu 0.52.5",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc 0.52.5",
+ "windows_x86_64_gnu 0.52.5",
+ "windows_x86_64_gnullvm 0.52.5",
+ "windows_x86_64_msvc 0.52.5",
 ]
 
 [[package]]
@@ -1409,42 +1596,90 @@ version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
 
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
+
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
 
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
+
 [[package]]
 name = "windows_i686_gnu"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
 
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
+
 [[package]]
 name = "windows_i686_msvc"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
 
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
+
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
 
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
+
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
 
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
+
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
 
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
+
 [[package]]
 name = "winnow"
 version = "0.5.40"
diff --git a/Cargo.toml b/Cargo.toml
index 2f4477f..a523174 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,3 +18,4 @@ serde = { version = "1.0.188", features = ["derive"] }
 serde_yaml = "0.9.25"
 stacker = "0.1.15"
 log = "0.4.21"
+human-panic = "2.0.0"
diff --git a/src/main.rs b/src/main.rs
index 3282090..5fafbc2 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,9 +1,12 @@
 #![allow(non_snake_case)]
 
+use human_panic::setup_panic;
 use fasta_manipulation::run;
 
 // https://doc.rust-lang.org/book/ch12-03-improving-error-handling-and-modularity.html#separation-of-concerns-for-binary-projects
 fn main() {
+    // Install human-panic's user-friendly panic report; see https://rust-cli.github.io/book/in-depth/human-communication.html
+    setup_panic!();
     if let Err(e) = run() {
         eprintln!("Error: {}", e);
         std::process::exit(1);

From 6b1d389430a31cd9ec115d73478f8c13a3e905fc Mon Sep 17 00:00:00 2001
From: Dasun Pubudumal <pubudumald@gmail.com>
Date: Tue, 4 Jun 2024 18:21:43 +0100
Subject: [PATCH 30/30] Fix linting issues

---
 src/main.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index 5fafbc2..6163c35 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,7 +1,7 @@
 #![allow(non_snake_case)]
 
-use human_panic::setup_panic;
 use fasta_manipulation::run;
+use human_panic::setup_panic;
 
 // https://doc.rust-lang.org/book/ch12-03-improving-error-handling-and-modularity.html#separation-of-concerns-for-binary-projects
 fn main() {