From 246fa5c0dd7a03b76d1a9aa66e31a7a5188c7e14 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 24 May 2024 17:22:17 +0100 Subject: [PATCH 01/30] Refactoring with Clap enums --- Cargo.lock | 19 ++ Cargo.toml | 2 +- src/cli/mod.rs | 184 ++++++++++++++++ src/exclude_seq.rs | 44 ---- src/main.rs | 353 ++++--------------------------- src/map_headers.rs | 142 ------------- src/processors.rs | 7 + src/processors/exclude_seq.rs | 40 ++++ src/processors/map_headers.rs | 131 ++++++++++++ src/processors/remap_head.rs | 66 ++++++ src/processors/split_by_count.rs | 100 +++++++++ src/processors/split_by_size.rs | 10 + src/processors/tpf_fasta.rs | 272 ++++++++++++++++++++++++ src/processors/yaml_validator.rs | 265 +++++++++++++++++++++++ src/remap_head.rs | 73 ------- src/split_by_count.rs | 111 ---------- src/split_by_size.rs | 12 -- src/tpf_fasta.rs | 277 ------------------------ src/yaml_validator.rs | 272 ------------------------ 19 files changed, 1141 insertions(+), 1239 deletions(-) create mode 100644 src/cli/mod.rs delete mode 100644 src/exclude_seq.rs delete mode 100644 src/map_headers.rs create mode 100644 src/processors.rs create mode 100644 src/processors/exclude_seq.rs create mode 100644 src/processors/map_headers.rs create mode 100644 src/processors/remap_head.rs create mode 100644 src/processors/split_by_count.rs create mode 100644 src/processors/split_by_size.rs create mode 100644 src/processors/tpf_fasta.rs create mode 100644 src/processors/yaml_validator.rs delete mode 100644 src/remap_head.rs delete mode 100644 src/split_by_count.rs delete mode 100644 src/split_by_size.rs delete mode 100644 src/tpf_fasta.rs delete mode 100644 src/yaml_validator.rs diff --git a/Cargo.lock b/Cargo.lock index 7630133..a09d981 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -199,6 +199,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1d7b8d5ec32af0fadc644bf1fd509a688c2103b185644bb1e29d164e0703136" dependencies = [ "clap_builder", + "clap_derive", ] [[package]] @@ -213,6 +214,18 @@ dependencies = [ "strsim", ] +[[package]] +name = "clap_derive" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0862016ff20d69b84ef8247369fabf5c008a7417002411897d40ee1f4532b873" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.60", +] + [[package]] name = "clap_lex" version = "0.5.1" @@ -626,6 +639,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "hermit-abi" version = "0.3.3" diff --git a/Cargo.toml b/Cargo.toml index 4c55605..b6f6872 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -clap = { version = "4.4.4", features = ["cargo"] } +clap = { version = "4.4.4", features = ["cargo", "derive"] } colored = "2.0.4" compare = "0.1.0" csv = "1.3.0" diff --git a/src/cli/mod.rs b/src/cli/mod.rs new file mode 100644 index 0000000..74d50ef --- /dev/null +++ b/src/cli/mod.rs @@ -0,0 +1,184 @@ +use clap::{Parser, Subcommand}; +use clap::builder::Str; + +const SPLIT_OPTIONS: [&str; 5] = ["pep", "cds", "cdna", "rna", "other"]; + +// CLI for Fasta Processing +#[derive(Parser)] +#[command(version="v1.0.0", about, long_about = None)] +pub struct Cli { + // command is optional (TODO: Make this not optional) + // Reference: https://docs.rs/clap/latest/clap/_derive/_tutorial/chapter_2/index.html#defaults + #[command(subcommand)] + pub command: Option +} + +// Reference: https://docs.rs/clap/latest/clap/_derive/_tutorial/chapter_2/index.html +#[derive(Subcommand)] +pub enum Commands { + YamlValidator { + // Path to the TreeVal yaml file generated by the user + #[arg(short, long)] + yaml: String, + + // Print explainers as to why validation fails, if it does fail + #[arg(short = 'v', long)] + verbose: bool, + + // Output the log to file + #[arg(short = 'o', long, default_value_t=String::from("./"))] + output: String + }, + + SplitByCount { + + // A path to a valid fasta file. + #[arg(short = 'f', long)] + fasta_file: String, + + // The output directory that files will be placed in | outfile will be formatted like {input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa + #[arg(short = 'o', long, default_value_t = String::from("./"))] + output_directory: String, + + // The data type of the input data + #[arg(short = 'd', value_parser = clap::builder::PossibleValuesParser::new(SPLIT_OPTIONS))] + data_type: String , + + // Do we need to sanitise the headers of the input fasta + #[arg(short = 's', value_parser = clap::value_parser!(bool))] + sanitise: bool, + + // How many sequences per file + #[arg(short = 'c', value_parser = clap::value_parser!(u16))] + count: u16, + }, + + SplitBySize { + // A path to a valid fasta file. + #[arg(short = 'f', long)] + fasta_file: String, + + // Size in MB that a fasta file is to be chunked into + #[arg(short = 's', long = "mem-size")] + mem_size: u16, + + // The output directory that files will be placed in | outfile will be formatted like {input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa + #[arg(short = 'o', long, default_value_t = String::from("./"))] + output_directory: String, + }, + + GenesetCSVS { + // The path to the top level directory of your geneset directory. + #[arg(short = 'd')] + geneset_dir: String, + + // Specify the clade folder to refresh + #[arg(short = 'c', default_value_t = String::from("ALL"))] + specifiy_clade: String, + }, + + MapHeaders { + // A path to a valid fasta file. + #[arg(short = 'f', long)] + fasta_file: String, + + // The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta + #[arg(short = 'o', long, default_value_t = String::from("./"))] + output_directory: String, + + #[arg(short = 'r', default_value_t = String::from("FMMH"))] + replace_with: String + }, + + ReMapHeaders { + // A path to a valid fasta file. + #[arg(short = 'f', long)] + fasta_file: String, + + // The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta + #[arg(short = 'o', long, default_value_t = String::from("./new"))] + output_directory: String, + + // "The original mapped header field, a TSV of old-header, new-header + #[arg(short = 'm', default_value_t = String::from("FMMH"))] + map_file: String + }, + + #[command(version, about="Profile an input fasta file and return various statistics", long_about = None)] + Profile { + // A path to a valid fasta file. + #[arg(short = 'f', long)] + fasta_file: String, + + // The input fasta file for profiling + #[arg(short = 'o', long, default_value_t = String::from("FasMan-out"))] + output_dir: String + }, + + Curate { + // The input fasta file for re-organising + #[arg(short = 'f', long)] + fasta: String, + + // The TPF file used to re-organise the input fasta + #[arg(short = 't', long)] + tpf: String, + + // Size sort the output or leave as order in AGP + #[arg(short = 's')] + sort: bool, + + #[arg(short = 'o', default_value_t = String::from("new.fasta"))] + output: String, + + // Length that the N (gap) string should be. + #[arg(short, long, default_value_t = 200)] + n_length: usize + }, + + Subset { + // A path to a valid fasta file for profiling. + #[arg(short = 'f', long)] + fasta_file: String, + + // Random subset of input file. Default skims the first X given percent + #[arg(short = 'r', long)] + random: bool, + + // Percentage of the original file entries that should be retained + #[arg(short = 'p', long, default_value_t = 50)] + percent: u16 + }, + + FilterFasta { + // A fasta file for processing. + #[arg(short = 'f', long)] + fasta: String, + + // The outfile naming + #[arg(short = 'o', default_value_t = String::from("FilteredFasta.fa"))] + output: String, + + #[arg(short = 'l', long = "filter_list")] + filter_list: String + }, + + Mergehaps { + + // The input fasta file for re-organising + #[arg(short = 'p', long)] + fasta_1: String, + + // The second input fasta file + #[arg(short = 's', long)] + fasta_2: String, + + // TA '/' separated list with an item per file, these are the namings of the new scaffolds in the merged output + #[arg(short = 's', long, default_value_t = String::from("PRI/HAP"))] + naming: String, + + // Output file prefix + #[arg(short = 'o', default_value_t = String::from("merged"))] + output: String, + } +} \ No newline at end of file diff --git a/src/exclude_seq.rs b/src/exclude_seq.rs deleted file mode 100644 index ab82c4e..0000000 --- a/src/exclude_seq.rs +++ /dev/null @@ -1,44 +0,0 @@ -pub mod exclude_seq_mod { - use clap::ArgMatches; - use noodles::fasta; - use std::error::Error; - use std::{fs, io::BufRead, str}; - - fn open_fasta<'a>( - exclusions: Vec<&str>, - fasta: &'a str, - out_file: &str, - ) -> std::result::Result<&'a str, Box> { - let reader: Result>, std::io::Error> = - fasta::reader::Builder.build_from_path(fasta); - let file = fs::OpenOptions::new() - .create(true) - .append(true) - .open(out_file)?; - let mut writer = fasta::Writer::new(file); - - match reader { - Ok(fasta) => { - let mut binding = fasta; - for result in binding.records() { - let record = result?; - if !exclusions.contains(&record.name()) { - writer.write_record(&record)?; - } else { - println!("Found record to exclude: {:?}", &record.name()); - } - } - Ok("Removed Exclusionary List") - } - Err(_) => Err("Error: Fasta is not valid check file!".into()), - } - } - - pub fn filter_fasta(arguments: std::option::Option<&ArgMatches>) { - let fasta = arguments.unwrap().get_one::("fasta").unwrap(); - let exclude = arguments.unwrap().get_one::("filter_list").unwrap(); - let outfile = arguments.unwrap().get_one::("output").unwrap(); - let list_to_exclude = exclude.split(',').collect::>(); - let _x = open_fasta(list_to_exclude, fasta, outfile); - } -} diff --git a/src/main.rs b/src/main.rs index 9e947f9..0f635ac 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,327 +1,66 @@ #![allow(non_snake_case)] -use clap::{command, Arg, Command}; +use clap::{command, Arg, Command, Parser}; use colored::Colorize; use std::env; use std::io::Error; -mod yaml_validator; -use crate::yaml_validator::yaml_validator_mod::validate_yaml; - -mod map_headers; -use crate::map_headers::mapping_headers::map_fasta_head; - -mod remap_head; -use crate::remap_head::remapping_headers::remapping_head; - -mod split_by_size; -use crate::split_by_size::split_by_size_mod::split_file_by_size; - -mod split_by_count; -use crate::split_by_count::split_by_count_mod::split_file_by_count; +mod cli; +use cli::{Cli, Commands}; mod generics; //use crate::generics::validate_fasta; -mod tpf_fasta; -use crate::tpf_fasta::tpf_fasta_mod::curate_fasta; +// Reference: https://doc.rust-lang.org/book/ch07-02-defining-modules-to-control-scope-and-privacy.html +use crate::processors::yaml_validator::validate_yaml; +use crate::processors::split_by_count::split_file_by_count; +use crate::processors::split_by_size::split_file_by_size; +use crate::processors::map_headers::map_fasta_head; +use crate::processors::remap_head::remapping_head; +use crate::processors::tpf_fasta::curate_fasta; +use crate::processors::exclude_seq::filter_fasta; +mod processors; -mod exclude_seq; -use crate::exclude_seq::exclude_seq_mod::filter_fasta; fn main() -> Result<(), Error> { - let split_options = ["pep", "cds", "cdna", "rna", "other"]; - let match_result = command!() - .about("A program for fasta manipulation and yaml validation ~ Used in TreeVal project") - .subcommand( - Command::new("validateyaml") - .about("Subcommand for validating the users TreeVal yaml file") - .arg( - Arg::new("yaml") - .required(true) - .help("Path to the TreeVal yaml file generated by the user") - ) - .arg( - Arg::new("verbose") - .short('v') - .value_parser(clap::value_parser!(bool)) - .default_value("false") - .help("Print explainers as to why validation fails, if it does fail") - ) - .arg( - Arg::new("output") - .short('o') - .default_value("./") - .help("Output the log to file") - ) - ) - .subcommand( - Command::new("splitbycount") - .about("Subcommand for splitting fasta files by number of sequence-header pairs, e.g., 100 pairs per file") - .arg( - Arg::new("fasta-file") - .short('f') - .required(true) - .help("A path to a valid fasta file.") - ) - .arg( - Arg::new("output-directory") - .short('o') - .default_value("./") - .help("The output directory that files will be placed in | outfile will be formatted like {input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa") - ) - .arg( - Arg::new("data_type") - .short('d') - .value_parser(clap::builder::PossibleValuesParser::new(split_options)) - .help("The data type of the input data") - ) - .arg( - Arg::new("sanitise") - .short('s') - .value_parser(clap::value_parser!(bool)) - .help("Do we need to sanitise the headers of the input fasta") - ) - .arg( - Arg::new("count") - .short('c') - .value_parser(clap::value_parser!(u16)) - .help("How many sequences per file") - ) - ) - .subcommand( - Command::new("splitbysize") - .about("Subcommand for splitting fasta files by user given size (in MegaBytes) into n (fasta_size / user_given_size) files") - .arg( - Arg::new("fasta-file") - .short('f') - .required(true) - .help("A path to a valid fasta file.") - ) - .arg( - Arg::new("mem-size") - .short('s') - .required(true) - .value_parser(clap::value_parser!(u16)) - .help("Size in MB that a fasta file is to be chunked into") - ) - .arg( - Arg::new("output-directory") - .short('o') - .default_value("./") - .help("The output directory that files will be placed in") - ) - ) - .subcommand( - Command::new("geneset_csvs") - .about("Subcommand to generate csv files that condense geneset directories generated by splitbycount/splitbysize. Mainly for use in TreeVal") - .arg( - Arg::new("geneset_dir") - .short('d') - .required(true) - .help("The path to the top level directory of your geneset directory.") - ) - .arg( - Arg::new("specifiy_clade") - .short('c') - .required(true) - .default_value("ALL") - .help("Specify the clade folder to refresh") - ) - ) - .subcommand( - Command::new("mapheaders") - .about("Subcommand for stripping out headers and replacing with a standardised automatic or user-given string, this also returns a dict of old:new headers") - .arg( - Arg::new("fasta-file") - .short('f') - .required(true) - .help("A path to a valid fasta file.") - ) - .arg( - Arg::new("output-directory") - .short('o') - .default_value("./") - .help("The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta") - ) - .arg( - Arg::new("replace-with") - .short('r') - .default_value("FMMH") - .help("The new header format, appended with a numerical value. Without being set the new header will default to 'FMMH_{numberical}'") - ) - ) - .subcommand( - Command::new("remapheaders") - .about("Subcommand for stripping out previously mapped headers and replacing with the old headers") - .arg( - Arg::new("fasta-file") - .short('f') - .required(true) - .help("A path to a valid fasta file.") - ) - .arg( - Arg::new("output-directory") - .short('o') - .default_value("./new") - .help("The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta") - ) - .arg( - Arg::new("map-file") - .short('m') - .required(true) - .help("The original mapped header field, a TSV of old-header, new-header") - ) - ) - .subcommand( - Command::new("profile") - .about("Profile an input fasta file and return various statistics") - .arg( - Arg::new("fasta-file") - .short('f') - .required(true) - .help("The input fasta file for profiling") - ) - .arg( - Arg::new("output-dir") - .short('o') - .default_value("FasMan-out") - .help("The input fasta file for profiling") - ) - ) - .subcommand( - Command::new("curate") - .about("Convert an tpf file and original fasta file into a fasta file - useful for curation") - .arg( - Arg::new("fasta") - .short('f') - .required(true) - .help("The input fasta file for re-organising") - ) - .arg( - Arg::new("tpf") - .short('t') - .required(true) - .help("The TPF file used to re-organise the input fasta") - ) - .arg( - Arg::new("sort") - .short('s') - .value_parser(clap::value_parser!(bool)) - .default_value("false") - .help("Size sort the output or leave as order in AGP") - ) - .arg( - Arg::new("output") - .short('o') - .default_value("new.fasta") - .help("The output name of the new fasta file") - ) - .arg( - Arg::new("n_length") - .value_parser(clap::value_parser!(usize)) - .default_value("200") - .help("Length that the N (gap) string should be.") - ) - ) - .subcommand( - Command::new("subset") - .about("Subset a fasta file in a random manner by percentage of file") - .arg( - Arg::new("fasta-file") - .short('f') - .required(true) - .help("The input fasta file for profiling") - ) - .arg( - Arg::new("random") - .short('r') - .value_parser(clap::value_parser!(bool)) - .help("Random subset of input file. Default skims the first X given percent") - ) - .arg( - Arg::new("percent") - .short('p') - .value_parser(clap::value_parser!(u16)) - .default_value("50") - .help("Percentage of the original file entries that should be retained") - ) - ) - .subcommand( - Command::new("filterfasta") - .about("Filter a given list of sequences from fasta file") - .arg( - Arg::new("fasta") - .short('f') - .required(true) - .help("A fasta file for processing") - ) - .arg( - Arg::new("output") - .short('o') - .default_value("FiilteredFasta.fa") - .help("The outfile naming") - ) - .arg( - Arg::new("filter_list") - .short('l') - .help("A string comma-separated list of sequence names to exclude from the final fasta") - ) - ) - .subcommand( - Command::new("mergehaps") - .about("Merge haplotypes / multi fasta files together") - .arg( - Arg::new("fasta-1") - .short('p') - .required(true) - .help("The input fasta file for re-organising") - ) - .arg( - Arg::new("fasta-2") - .short('s') - .required(true) - .help("The second input fasta file") - ) - .arg( - Arg::new("naming") - .short('s') - .default_value("PRI/HAP") - .help("A '/' separated list with an item per file, these are the namings of the new scaffolds in the merged output") - ) - .arg( - Arg::new("output") - .short('o') - .default_value("merged") - .help("Output file prefix") - ) - ) - .get_matches(); - println! { - "{}\n{}\n{}\nRUNNING SUBCOMMAND: |\n-- {}\nRUNNING ON: |\n-- {}", - "WELCOME TO Fasta Manipulator".bold(), - "This has been made to help prep data for use in the Treeval and curationpretext pipelines".bold(), - "ONLY THE yamlvalidator IS SPECIFIC TO TREEVAL, THE OTHER COMMANDS CAN BE USED FOR ANY OTHER PURPOSE YOU WANT".purple(), - match_result.subcommand_name().unwrap(), - env::consts::OS - }; + let cli = Cli::parse(); - match match_result.subcommand_name() { - Some("splitbysize") => split_file_by_size(match_result.subcommand_matches("splitbysize")), - Some("splitbycount") => { - split_file_by_count(match_result.subcommand_matches("splitbycount")) + match &cli.command { + Some(Commands::YamlValidator { yaml, verbose, output }) => { + validate_yaml(yaml, verbose, output) + }, + Some(Commands::SplitByCount { fasta_file, output_directory, data_type, sanitise, count}) => { + split_file_by_count( + fasta_file, output_directory, data_type, sanitise, count + ) + }, + Some(Commands::SplitBySize { fasta_file, mem_size, output_directory }) => { + split_file_by_size( + fasta_file, + mem_size, + output_directory + ) + }, + Some(Commands::MapHeaders { fasta_file, output_directory, replace_with }) => { + _ = map_fasta_head(fasta_file, output_directory, replace_with) + }, + Some(Commands::ReMapHeaders { fasta_file, output_directory, map_file }) => { + remapping_head(fasta_file, output_directory, map_file) + } + Some(Commands::Curate { fasta, tpf, sort, output, n_length}) => { + curate_fasta(fasta, tpf, sort, output, n_length) } - Some("mapheaders") => { - _ = map_fasta_head(match_result.subcommand_matches("mapheaders")); + Some(Commands::FilterFasta { fasta, output, filter_list }) => { + filter_fasta(fasta, output, filter_list) } - Some("validateyaml") => validate_yaml(match_result.subcommand_matches("validateyaml")), - Some("remapheaders") => remapping_head(match_result.subcommand_matches("remapheaders")), - Some("curate") => curate_fasta(match_result.subcommand_matches("curate")), - Some("filterfasta") => filter_fasta(match_result.subcommand_matches("filterfasta")), - _ => { - unreachable!() + Some(Commands::GenesetCSVS { .. }) => { todo!() }, + Some(Commands::Profile { .. }) => { todo!() } + Some(Commands::Subset { .. }) => { todo!() } + Some(Commands::Mergehaps { .. }) => { todo!() } + None => { + panic!("No command given!") } - }; + } Ok(()) } diff --git a/src/map_headers.rs b/src/map_headers.rs deleted file mode 100644 index 2b066b0..0000000 --- a/src/map_headers.rs +++ /dev/null @@ -1,142 +0,0 @@ -pub mod mapping_headers { - - use clap::ArgMatches; - use colored::Colorize; - use std::error::Error; - use std::fmt; - use std::fs::File; - use std::io::{BufRead, BufReader, BufWriter, Write}; - use std::iter::Zip; - - use crate::generics::only_keys; - use crate::generics::validate_fasta; - - #[allow(dead_code)] - #[derive(Debug, Clone)] - struct EmptyVec; - impl Error for EmptyVec {} - - impl fmt::Display for EmptyVec { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Can't Display Empty Vec") - } - } - - #[allow(clippy::explicit_counter_loop)] - pub fn create_mapping( - name_vec: Vec, - new_name: &str, - ) -> Zip, std::vec::IntoIter> { - // Generate a new mapping for the Fasta - // - let mut new_heads: Vec = Vec::new(); - let mut head_counter: i32 = 0; - let name_vec_clone = name_vec.clone(); - - for _x in name_vec { - new_heads.push(format!("{}_{}", new_name, head_counter)); - head_counter += 1; - } - - let mapped_heads: Zip, std::vec::IntoIter> = - name_vec_clone.into_iter().zip(new_heads); - - mapped_heads - } - - pub fn save_mapping( - output: &str, - mapped: Zip< - std::vec::IntoIter, - std::vec::IntoIter, - >, - ) { - let f: File = File::create(output).expect("Unable to create file"); - let mut f: BufWriter = BufWriter::new(f); - for map_pair in mapped { - let line: String = format!("{}\t{}\n", map_pair.0, map_pair.1); - f.write_all(&line.into_bytes()) - .expect("Unable to write data"); - } - } - - #[allow(unused_mut)] - pub fn create_mapped_fasta( - input: &str, - output: &str, - mapped: Zip< - std::vec::IntoIter, - std::vec::IntoIter, - >, - ) { - let file_reader: File = File::open(input).expect("CAN'T OPEN FILE"); - let buff_reader: BufReader = BufReader::new(file_reader); - let mut new_fasta: File = File::create(output).unwrap(); - - for line in buff_reader.lines() { - let l: &str = &line.as_ref().unwrap()[..]; - if l.starts_with('>') { - let mut to_replace = l.replace('>', ""); - let mut mapped_heads: Zip, std::vec::IntoIter> = - mapped.clone(); - let mut map: Option<(String, String)> = - mapped_heads.find(|x: &(String, String)| x.0 == to_replace); - let mut new_head: String = map.expect("").1; - let fmt_head: String = format!(">{}\n", new_head); - let _ = new_fasta.write_all(&fmt_head.into_bytes()); - } else { - let mut seq = line.expect(""); - let fmt_seq = format!("{}\n", seq); - let _ = new_fasta.write_all(&fmt_seq.into_bytes()); - } - } - } - - pub fn map_fasta_head( - arguments: std::option::Option<&ArgMatches>, - ) -> Result<(), Box> { - let file: &String = arguments.unwrap().get_one::("fasta-file").unwrap(); - let replacer: &String = arguments - .unwrap() - .get_one::("replace-with") - .unwrap(); - let output: &String = arguments - .unwrap() - .get_one::("output-directory") - .unwrap(); - - println!("Mapping headers for file: {}", file); - println!("Replace headers with string: {:?}", &replacer); - - match validate_fasta(file) { - Ok(names) => { - let new_names = Vec::from_iter(only_keys(names)); - - let new_map: Zip, std::vec::IntoIter> = - create_mapping(new_names, replacer); - - let map_to_save: Zip, std::vec::IntoIter> = - new_map.clone(); - let output_file = format!("{}mapped-heads.tsv", output); - - save_mapping(&output_file, map_to_save); - - let new_fasta: String = format!("{output}mapped.fasta"); - - create_mapped_fasta(file, &new_fasta, new_map); - - println!( - "{}\n{}\n\t{}\n\t{}", - "FASTA HAS BEEN MAPPED AND REWRITTEN".green(), - "FOUND HERE:".green(), - &new_fasta.green(), - &output_file.green() - ); - } - - Err(e) => panic!("Something is wrong with the file! | {}", e), - }; - - Ok(()) - } -} diff --git a/src/processors.rs b/src/processors.rs new file mode 100644 index 0000000..0c1ad7f --- /dev/null +++ b/src/processors.rs @@ -0,0 +1,7 @@ +pub mod yaml_validator; +pub mod split_by_count; +pub mod split_by_size; +pub mod map_headers; +pub mod remap_head; +pub mod tpf_fasta; +pub mod exclude_seq; diff --git a/src/processors/exclude_seq.rs b/src/processors/exclude_seq.rs new file mode 100644 index 0000000..89ecc8d --- /dev/null +++ b/src/processors/exclude_seq.rs @@ -0,0 +1,40 @@ +use clap::ArgMatches; +use noodles::fasta; +use std::error::Error; +use std::{fs, io::BufRead, str}; + +fn open_fasta<'a>( + exclusions: Vec<&str>, + fasta: &'a str, + out_file: &str, +) -> std::result::Result<&'a str, Box> { + let reader: Result>, std::io::Error> = + fasta::reader::Builder.build_from_path(fasta); + let file = fs::OpenOptions::new() + .create(true) + .append(true) + .open(out_file)?; + let mut writer = fasta::Writer::new(file); + + match reader { + Ok(fasta) => { + let mut binding = fasta; + for result in binding.records() { + let record = result?; + if !exclusions.contains(&record.name()) { + writer.write_record(&record)?; + } else { + println!("Found record to exclude: {:?}", &record.name()); + } + } + Ok("Removed Exclusionary List") + } + Err(_) => Err("Error: Fasta is not valid check file!".into()), + } +} + +pub fn filter_fasta(fasta: &String, outfile: &String, exclude: &String) { + let list_to_exclude = exclude.split(',').collect::>(); + let _x = open_fasta(list_to_exclude, fasta, outfile); +} + diff --git a/src/processors/map_headers.rs b/src/processors/map_headers.rs new file mode 100644 index 0000000..18a9847 --- /dev/null +++ b/src/processors/map_headers.rs @@ -0,0 +1,131 @@ +use clap::ArgMatches; +use colored::Colorize; +use std::error::Error; +use std::fmt; +use std::fs::File; +use std::io::{BufRead, BufReader, BufWriter, Write}; +use std::iter::Zip; + +use crate::generics::only_keys; +use crate::generics::validate_fasta; + +#[allow(dead_code)] +#[derive(Debug, Clone)] +struct EmptyVec; +impl Error for EmptyVec {} + +impl fmt::Display for EmptyVec { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Can't Display Empty Vec") + } +} + +#[allow(clippy::explicit_counter_loop)] +pub fn create_mapping( + name_vec: Vec, + new_name: &str, +) -> Zip, std::vec::IntoIter> { + // Generate a new mapping for the Fasta + // + let mut new_heads: Vec = Vec::new(); + let mut head_counter: i32 = 0; + let name_vec_clone = name_vec.clone(); + + for _x in name_vec { + new_heads.push(format!("{}_{}", new_name, head_counter)); + head_counter += 1; + } + + let mapped_heads: Zip, std::vec::IntoIter> = + name_vec_clone.into_iter().zip(new_heads); + + mapped_heads +} + +pub fn save_mapping( + output: &str, + mapped: Zip< + std::vec::IntoIter, + std::vec::IntoIter, + >, +) { + let f: File = File::create(output).expect("Unable to create file"); + let mut f: BufWriter = BufWriter::new(f); + for map_pair in mapped { + let line: String = format!("{}\t{}\n", map_pair.0, map_pair.1); + f.write_all(&line.into_bytes()) + .expect("Unable to write data"); + } +} + +#[allow(unused_mut)] +pub fn create_mapped_fasta( + input: &str, + output: &str, + mapped: Zip< + std::vec::IntoIter, + std::vec::IntoIter, + >, +) { + let file_reader: File = File::open(input).expect("CAN'T OPEN FILE"); + let buff_reader: BufReader = BufReader::new(file_reader); + let mut new_fasta: File = File::create(output).unwrap(); + + for line in buff_reader.lines() { + let l: &str = &line.as_ref().unwrap()[..]; + if l.starts_with('>') { + let mut to_replace = l.replace('>', ""); + let mut mapped_heads: Zip, std::vec::IntoIter> = + mapped.clone(); + let mut map: Option<(String, String)> = + mapped_heads.find(|x: &(String, String)| x.0 == to_replace); + let mut new_head: String = map.expect("").1; + let fmt_head: String = format!(">{}\n", new_head); + let _ = new_fasta.write_all(&fmt_head.into_bytes()); + } else { + let mut seq = line.expect(""); + let fmt_seq = format!("{}\n", seq); + let _ = new_fasta.write_all(&fmt_seq.into_bytes()); + } + } +} + +pub fn map_fasta_head( + file: &String, output: &String, replacer: &String +) -> Result<(), Box> { + + println!("Mapping headers for file: {}", file); + println!("Replace headers with string: {:?}", &replacer); + + match validate_fasta(file) { + Ok(names) => { + let new_names = Vec::from_iter(only_keys(names)); + + let new_map: Zip, std::vec::IntoIter> = + create_mapping(new_names, replacer); + + let map_to_save: Zip, std::vec::IntoIter> = + new_map.clone(); + let output_file = format!("{}mapped-heads.tsv", output); + + save_mapping(&output_file, map_to_save); + + let new_fasta: String = format!("{output}mapped.fasta"); + + create_mapped_fasta(file, &new_fasta, new_map); + + println!( + "{}\n{}\n\t{}\n\t{}", + "FASTA HAS BEEN MAPPED AND REWRITTEN".green(), + "FOUND HERE:".green(), + &new_fasta.green(), + &output_file.green() + ); + } + + Err(e) => panic!("Something is wrong with the file! | {}", e), + }; + + Ok(()) +} + diff --git a/src/processors/remap_head.rs b/src/processors/remap_head.rs new file mode 100644 index 0000000..44ef5d5 --- /dev/null +++ b/src/processors/remap_head.rs @@ -0,0 +1,66 @@ +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::iter::Zip; + +use colored::Colorize; + +use crate::generics::validate_fasta; +use crate::processors::map_headers::create_mapped_fasta; + +pub fn pull_map_from_tsv( + map_file: &str, +) -> Zip, std::vec::IntoIter> { + let file_reader: File = File::open(map_file).expect("CAN'T OPEN FILE"); + let buff_reader: BufReader = BufReader::new(file_reader); + + let mut old_head: Vec = Vec::new(); + let mut new_head: Vec = Vec::new(); + + for line in buff_reader.lines() { + match line { + Ok(string) => { + let mut old_new = string.split('\t'); + let x = old_new.next().unwrap(); + let y = old_new.next().unwrap(); + old_head.push(x.to_string()); + new_head.push(y.to_string()); + } + Err(_) => { + print!("") + } + }; + } + + let mapped_heads: Zip, std::vec::IntoIter> = + new_head.into_iter().zip(old_head); + + mapped_heads +} + +pub fn remapping_head(file: &String, output: &String, map_file: &String) { + + println!("Mapping headers for file: {}", file); + println!("Replace headers with string: {}", map_file); + + match validate_fasta(file) { + Ok(_thing) => { + let new_map: Zip, std::vec::IntoIter> = + pull_map_from_tsv(map_file); + + let new_fasta: String = format!("{output}_OH.fasta"); + + create_mapped_fasta(file, &new_fasta, new_map); + + println!( + "{}\n{}\n\t{}\n", + "FASTA HAS BEEN RE-APPED AND REWRITTEN".green(), + "FOUND HERE:".green(), + &new_fasta.green() + ); + } + Err(_) => { + println!("NOT A VALID FASTA") + } + }; +} + diff --git a/src/processors/split_by_count.rs b/src/processors/split_by_count.rs new file mode 100644 index 0000000..dda0b39 --- /dev/null +++ b/src/processors/split_by_count.rs @@ -0,0 +1,100 @@ +use crate::generics::sanitise_header; +use clap::ArgMatches; +use compare::{natural, Compare}; +use noodles::fasta::{self, Record}; +use std::cmp::Ordering; +use std::fs::OpenOptions; +use std::{ + fs::{create_dir_all, File}, + io::BufReader, + path::Path, +}; + +#[allow(clippy::needless_return)] +fn fix_head(records: Record, sanitise: bool) -> Record { + if sanitise { + let header = sanitise_header(records.definition()); + let definition = fasta::record::Definition::new(header, None); + let seq = records.sequence().to_owned(); + return fasta::Record::new(definition, seq); + } else { + return records.to_owned(); + }; +} + +fn write_fasta(outdir: &String, fasta_record: &Vec) { + println!("{}", outdir); + + let _data_file = File::create(outdir); + let file = OpenOptions::new() + .append(true) + .open(outdir) + .expect("creation failed"); + + let mut writer = fasta::Writer::new(file); + for i in fasta_record { + writer.write_record(i).unwrap(); + } +} + +pub fn split_file_by_count(fasta_file: &String, output_directory: &String, data_type: &String, sanitise: &bool, fasta_count: &u16) { + let path_obj = Path::new(fasta_file); + let grab_name = path_obj.file_name().unwrap(); + let actual_list: Vec<&str> = grab_name.to_str().unwrap().split('.').collect(); + let actual_name = actual_list[0]; + + let new_outpath = format!("{}/{}/{}/", output_directory, actual_name, data_type); + create_dir_all(new_outpath.clone()).unwrap(); + println!( + "Fasta file for processing: {:?}\nNumber of records per file: {:?}", + fasta_file, fasta_count + ); + + let mut counter: u16 = 0; + let mut file_counter: u16 = 1; + + let file_name: Vec<&str> = actual_name.split('.').collect(); + + let mut reader = File::open(fasta_file) + .map(BufReader::new) + .map(fasta::Reader::new) + .unwrap(); + + let mut record_list: Vec = Vec::new(); + for result in reader.records() { + let record = result.unwrap(); + counter += 1; + + let final_rec = fix_head(record, *sanitise); + record_list.push(final_rec); + + let cmp = natural(); + let compared = cmp.compare(&counter, fasta_count); + if compared == Ordering::Equal { + let full_outpath = format!( + "{}{}_f{}_c{}-a{}.fa", + new_outpath, + file_name[0], + file_counter, + &fasta_count, + &record_list.len() + ); + + write_fasta(&full_outpath, &record_list); + file_counter += 1; + counter = 0; + record_list = Vec::new(); + } + } + + let full_outpath = format!( + "{}{}_f{}_c{}-a{}.fa", + new_outpath, + file_name[0], + file_counter, + &fasta_count, + &record_list.len() + ); + write_fasta(&full_outpath, &record_list); +} + diff --git a/src/processors/split_by_size.rs b/src/processors/split_by_size.rs new file mode 100644 index 0000000..5a65719 --- /dev/null +++ b/src/processors/split_by_size.rs @@ -0,0 +1,10 @@ +use clap::ArgMatches; + +pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, output_directory: &String) { + println!("Fasta file for processing: {:?}", &fasta_file); + println!( + "Size to chunk fasta into: {:?}", + mem_size + ); +} + diff --git a/src/processors/tpf_fasta.rs b/src/processors/tpf_fasta.rs new file mode 100644 index 0000000..f8863b3 --- /dev/null +++ b/src/processors/tpf_fasta.rs @@ -0,0 +1,272 @@ +use std::{fs::File, fs::read_to_string, str}; +use std::fs::OpenOptions; +use std::io::Write; + +use noodles::core::Position; +use noodles::fasta; +use noodles::fasta::record::Sequence; +use noodles::fasta::repository::adapters::IndexedReader; + +use crate::generics::validate_fasta; + +#[derive(Debug, Clone, PartialEq, Eq)] +struct Tpf { + ori_scaffold: String, + start_coord: usize, + end_coord: usize, + new_scaffold: String, + orientation: String, +} + +impl std::fmt::Display for Tpf { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + write!( + fmt, + "\t{} -- {} -- {}", + self.ori_scaffold, self.start_coord, self.end_coord + ) + } +} + +#[derive(Debug, PartialEq, Eq)] +struct NewFasta { + tpf: Tpf, + sequence: String, +} + +#[derive(Debug)] +struct MyRecord { + name: String, + sequence: Vec, +} + +fn parse_tpf(path: &String) -> Vec { + let mut all_tpf: Vec = Vec::new(); + for line in read_to_string(path).unwrap().lines() { + if line.starts_with('?') { + let line_replaced = line.replace('\t', " "); + let line_list: Vec<&str> = line_replaced.split_whitespace().collect(); + let scaff_data: Vec<&str> = line_list[1].split(':').collect(); + let scaff_coords: Vec<&str> = scaff_data[1].split('-').collect(); + let data = Tpf { + ori_scaffold: scaff_data[0].to_owned(), + start_coord: scaff_coords[0].to_owned().parse::().unwrap(), + end_coord: scaff_coords[1].to_owned().parse::().unwrap(), + new_scaffold: line_list[2].to_owned().replace("RL", "SUPER"), + orientation: line_list[3].to_owned(), + }; + all_tpf.push(data); + } + } + all_tpf +} + +fn subset_vec_tpf<'a>( + tpf: &'a Vec, + fasta: (&std::string::String, &usize), +) -> Vec<&'a Tpf> { + // + // Subset the Vec based on a search through the fasta + // + let mut subset_tpf: Vec<&Tpf> = Vec::new(); + for i in tpf { + if i.ori_scaffold == *fasta.0 { + subset_tpf.push(i) + } + } + subset_tpf +} + +fn check_orientation( + parsed: std::option::Option, + orientation: String, +) -> String { + if orientation == "MINUS" { + let start = Position::try_from(1).unwrap(); + let parse_orientation = parsed.unwrap(); + let compliment: Sequence = parse_orientation + .complement() + .collect::>() + .unwrap(); + let seq = compliment.get(start..).unwrap(); + str::from_utf8(seq).unwrap().chars().rev().collect() + } else { + let start = Position::try_from(1).unwrap(); + let parse_orientation = parsed.unwrap(); + let seq = parse_orientation.get(start..).unwrap(); + str::from_utf8(seq).unwrap().chars().collect() + } +} + +fn parse_seq( + sequence: std::option::Option, + tpf: Vec<&Tpf>, +) -> Vec { + let mut subset_tpf: Vec = Vec::new(); + // + // Take the input sequence and scaffold name + // Parse the input sequence based on the data contained in + // the TPF. Which is already a subset based on scaff name + // + + let new_seq = sequence.unwrap(); // Option(Sequence ()) -> Sequence () + for &i in &tpf { + let start = Position::try_from(i.start_coord).unwrap(); + let end = Position::try_from(i.end_coord).unwrap(); + //let region = Region::new(&i.new_scaffold, start.unwrap()..=end.unwrap()); + let parsed = new_seq.slice(start..=end); + let the_sequence = check_orientation(parsed, i.orientation.to_owned()); + let data = NewFasta { + tpf: i.to_owned(), + sequence: the_sequence, + }; + subset_tpf.push(data); + } + subset_tpf +} + +fn get_uniques(tpf_list: &Vec) -> Vec { + let mut uniques: Vec = Vec::new(); + + for i in tpf_list { + if !uniques.contains(&i.new_scaffold) { + uniques.push(i.new_scaffold.to_owned()) + } + } + uniques +} + +fn save_to_fasta( + fasta_data: Vec, + tpf_data: Vec, + output: &String, + n_length: usize, +) { + // + // TPF is in the input TPF order, this will continue to be the case until + // the script is modified and the Tpf struct gets modified in place for some reason + // + let _data_file = File::create(output); + let mut file = OpenOptions::new() + .write(true) + .open(output) + .expect("creation failed"); + + let _debugger = File::create("debug.txt"); + let mut file2 = OpenOptions::new() + .write(true) + .open("debug.txt") + .expect("creation failed"); + + let uniques = get_uniques(&tpf_data); + + // This is inefficient as we are scanning through the fasta_data, uniques number of times + // If uniques is 10 long and fasta is 100, then this is 1000 scans through in total. + let mut no_more: Vec = Vec::new(); + for x in uniques { + println!("NOW WRITING DATA FOR: {:?}", &x); + // X = "SUPER_1" + let stringy = format!(">{x}\n"); + file.write_all(stringy.as_bytes()) + .expect("Unable to write to file"); + file2 + .write_all(stringy.as_bytes()) + .expect("Unable to write to file"); + + let mut data: MyRecord = MyRecord { + name: "".to_string(), + sequence: Vec::new(), + }; + + no_more.push(x.to_owned()); + x.clone_into(&mut data.name); + for tpf in &tpf_data { + if tpf.new_scaffold == x { + for fasta in &fasta_data { + if fasta.tpf == *tpf { + let stringy = format!("\t{}\n", tpf); + file2 + .write_all(stringy.as_bytes()) + .expect("Unable to write to file"); + data.sequence.push(fasta.sequence.to_owned()); + } + } + } + } + + let line_len: usize = 60; + let fixed = data.sequence; + let n_string = "N".repeat(n_length); + let fixed2 = fixed.join(&n_string); //.join required a borrowed str + let fixed3 = fixed2 + .as_bytes() + .chunks(line_len) + .map(str::from_utf8) + .collect::, _>>() + .unwrap(); + + for i in fixed3 { + let formatted = i.to_owned() + "\n"; + file.write_all(formatted.as_bytes()).unwrap(); + } + println!("NO LONG SCANNING FOR: {:?}", &no_more) + } +} + +#[allow(clippy::needless_borrow)] +#[allow(clippy::let_and_return)] +pub fn curate_fasta(fasta_file: &String, tpf_file: &String, sort: &bool, output: &String, n_length: &usize) { + // + // Generate a curated fasta file based on the input TPF file + // which was generated by Pretext and the agp_to_tpf script. + // This new fasta file contains a new scaffold naming as well + // as pieced together sequences generated by the splitting of + // data in Pretext. + // + println!("LET'S GET CURATING THAT FASTA!"); + stacker::maybe_grow(32 * 1024, 1024 * 5120, || { + match validate_fasta(fasta_file) { + Ok(fasta_d) => { + let tpf_data = parse_tpf(&tpf_file); + //let _validated = varify_validity(&tpf_data, &fasta_d); + + // + // Start indexed reader of the input fasta + // if valid then use the data + // + let reader = + fasta::indexed_reader::Builder::default().build_from_path(fasta_file); + let fasta_repo = match reader { + Ok(data) => { + let adapter = IndexedReader::new(data); + let repository = fasta::Repository::new(adapter); + repository + } + Err(_) => todo!(), + }; + + // + // For unique scaffold in the fasta file iter through and + // parse sequence for each line in the tpf + // The tpf will contain multiple enteries for each scaffold, minimum of one entry. + // + let mut new_fasta_data: Vec = Vec::new(); + for i in fasta_d { + let subset_tpf = subset_vec_tpf(&tpf_data, (&i.0, &i.1)); + let sequence = fasta_repo.get(&i.0).transpose(); + + match sequence { + Ok(data) => { + let subset_results = parse_seq(data, subset_tpf); + new_fasta_data.extend(subset_results); + } + Err(e) => panic!("{:?}", e), + }; + } + save_to_fasta(new_fasta_data, tpf_data, output, n_length.to_owned()) + } + Err(e) => panic!("Something is wrong with the file! | {}", e), + } + }) +} + diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs new file mode 100644 index 0000000..d2a08f4 --- /dev/null +++ b/src/processors/yaml_validator.rs @@ -0,0 +1,265 @@ +use std::fs::{self, File}; +use std::io::ErrorKind; +use std::path::PathBuf; + +use colored::Colorize; +use csv::Error; +use csv::ReaderBuilder; +use noodles::fasta; +use serde::{Deserialize, Serialize}; + +// Would be nice if there was a simple format_check +// use noodles::cram as cram; + +#[derive(Debug, Serialize, Deserialize)] +struct TreeValYaml { + assembly: Assembly, + reference_file: String, + assem_reads: AssemReads, + alignment: Alignment, + self_comp: SelfComp, + intron: Intron, + telomere: Telomere, + synteny: Synteny, + busco: Busco, +} + +#[derive(Debug, Serialize, Deserialize)] +struct Assembly { + level: String, + sample_id: String, + latin_name: String, + classT: String, + asmVersion: u16, + gevalType: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct AssemReads { + pacbio: String, + hic: String, + supplementary: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct Alignment { + data_dir: String, + common_name: String, + geneset: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct SelfComp { + motif_len: u16, + mummer_chunk: u16, +} + +#[derive(Debug, Serialize, Deserialize)] +struct Intron { + size: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct Telomere { + teloseq: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct Synteny { + synteny_genome_path: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct Busco { + lineages_path: String, + lineage: String, +} + +// +// CSV STRUCT +// +//#[derive(Deserialize)] +//struct Record { +// org: String, +// type: String, +// data_file: String +//} + +pub fn validate_paths(path: &str, field_id: &str) { + match fs::metadata(path) { + Ok(_) => { + println!( + "{}{} \t{}\t{}", + ">-".green(), + &field_id.green(), + "| PATH EXISTS: ".green(), + path.green() + ); + match field_id { + "REFERENCE" => validate_fasta(path), + "GENESET-CSV" => { + _ = validate_csv(path); + } + "HIC" => {} + _ => println!("Error"), + } + } + Err(_) => println!( + "{}{} \t{}\t{}", + "<-".red().bold(), + &field_id.red().bold(), + "| CHECK YAML!:".red().bold(), + path + ), + } +} + +pub fn validate_fasta(path: &str) { + let reader = fasta::reader::Builder.build_from_path(path); + + let mut binding = reader.expect("NO VALID HEADER / SEQUENCE PAIRS"); + let result = binding.records(); + let counter = result.count(); + println!( + "{} {} {}", + ">- REFERENCE H/S PAIRS:".green(), + counter, + "H/S PAIRS".green() + ) +} + +pub fn validate_csv(path: &str) -> Result<(), Error> { + let file = File::open(path)?; + + let mut reader = ReaderBuilder::new() + .has_headers(true) + .delimiter(b',') + .from_reader(file); + + let record = reader.records().count(); + println!( + "{} {} {}", + ">-GENESET-RECORD-COUNT: >".green(), + record, + "<".green() + ); + + Ok(()) +} + +// +// FUNCTION: Check if pacbio has fasta.gz files, cram has cram and crai and synteny has fasta +// could make this much easier and consise by passing in a list of file types to check +// validatedata(path, [fa, fna, fasta]) +// +pub fn validate_data(path: &str, dtype: &str) { + match fs::read_dir(path) { + Err(e) if e.kind() == ErrorKind::NotFound => {} + Err(e) => panic!("{} {e}", "<-DIRECTORY PATH DOESN'T EXIST: ".red().bold()), + Ok(data_files) => { + if dtype == "pacbio" { + let files: Vec = data_files + .filter_map(|f| f.ok()) + .filter(|d| match d.path().extension() { + None => false, + Some(ex) => ex == "fasta.gz", + }) + .map(|f| f.path()) + .collect(); + + if files.is_empty() { + println!("{}", "<-NO PACBIO DATA FILES".red()) + } else { + println!("{} {:?}", ">-YOUR FILES ARE:".green(), &files); + } + } else if dtype == "hic" { + let files: Vec = data_files + .filter_map(|f| f.ok()) + .filter(|d| match d.path().extension() { + None => false, + Some(ex) => ex == "cram" || ex == "crai", + }) + .map(|f| f.path()) + .collect(); + + if files.is_empty() { + println!("{}", "<-NO HIC DATA FILES".red()) + } else { + println!("{} {:?}", ">-YOUR FILES ARE:".green(), &files); + } + } else if dtype == "synteny" { + let files: Vec = data_files + .filter_map(|f| f.ok()) + .filter(|d| match d.path().extension() { + None => false, + Some(ex) => ex == "fa" || ex == "fasta" || ex == "fna", + }) + .map(|f| f.path()) + .collect(); + + if files.is_empty() { + println!("{}", "<-NO SYNTENIC GENOMES".red()) + } else { + println!("{} {:?}", ">-YOUR GENOMES ARE:".green(), &files); + } + } + } + }; +} + +pub fn validate_yaml(file: &String, verbose: &bool, output: &String) { + + println! {"Validating Yaml: {}", file.purple()}; + + let input = fs::File::open(file).expect("Unable to read from file"); + let contents: TreeValYaml = + serde_yaml::from_reader(input).expect("Unable to read from file"); + + println!( + "RUNNING VALIDATE-YAML FOR SAMPLE: {}", + contents.assembly.sample_id.purple() + ); + + validate_paths(&contents.reference_file, "REFERENCE"); + validate_paths(&contents.alignment.data_dir, "GENESET"); + validate_paths(&contents.synteny.synteny_genome_path, "SYNTENY"); + validate_paths(&contents.busco.lineages_path, "BUSCO"); + + validate_paths(&contents.assem_reads.pacbio, "PACBIO"); + validate_data(&contents.assem_reads.pacbio, "pacbio"); + + validate_paths(&contents.assem_reads.hic, "HIC"); + validate_data(&contents.assem_reads.hic, "hic"); + + println!("{}", "CHECKING GENESET DIRECTORY RESOLVES".blue()); + let genesets = contents.alignment.geneset.split(','); + for set in genesets { + let gene_alignment_path = contents.alignment.data_dir.clone() + + &contents.assembly.classT + + "/csv_data/" + + set + + "-data.csv"; + validate_paths(&gene_alignment_path, "GENESET-CSV"); + } + + println!("{}", "CHECKING SYNTENY DIRECTORY RESOLVES".blue()); + let synteny_full = + contents.synteny.synteny_genome_path.clone() + &contents.assembly.classT + "/"; + validate_paths(&synteny_full, "SYNTENY-FASTA"); + validate_data(&synteny_full, "synteny"); + + println!("{}", "CHECKING BUSCO DIRECTORY RESOLVES".blue()); + let busco_path = + contents.busco.lineages_path.clone() + "/lineages/" + &contents.busco.lineage; + validate_paths(&busco_path, "BUSCO-DB"); + // NOW CHECK FOR FILES IN DIRECTORY? + + println!( + "{}\n{}\n{}\n{}\n{}", + "VALIDATION COMPLETE".purple().bold(), + "GENERAL INFORMATION:".purple().bold(), + "Check the log to see what failed".bold(), + "FULL : ONLY synteny fails are permitted".purple(), + "RAPID: geneset, busco and synteny fails are permitted".purple() + ); +} diff --git a/src/remap_head.rs b/src/remap_head.rs deleted file mode 100644 index 83e20ef..0000000 --- a/src/remap_head.rs +++ /dev/null @@ -1,73 +0,0 @@ -pub mod remapping_headers { - use crate::map_headers; - use clap::ArgMatches; - use colored::Colorize; - use std::fs::File; - use std::io::{BufRead, BufReader}; - use std::iter::Zip; - - use crate::generics::validate_fasta; - - pub fn pull_map_from_tsv( - map_file: &str, - ) -> Zip, std::vec::IntoIter> { - let file_reader: File = File::open(map_file).expect("CAN'T OPEN FILE"); - let buff_reader: BufReader = BufReader::new(file_reader); - - let mut old_head: Vec = Vec::new(); - let mut new_head: Vec = Vec::new(); - - for line in buff_reader.lines() { - match line { - Ok(string) => { - let mut old_new = string.split('\t'); - let x = old_new.next().unwrap(); - let y = old_new.next().unwrap(); - old_head.push(x.to_string()); - new_head.push(y.to_string()); - } - Err(_) => { - print!("") - } - }; - } - - let mapped_heads: Zip, std::vec::IntoIter> = - new_head.into_iter().zip(old_head); - - mapped_heads - } - - pub fn remapping_head(arguments: std::option::Option<&ArgMatches>) { - let file: &String = arguments.unwrap().get_one::("fasta-file").unwrap(); - let map_file: &String = arguments.unwrap().get_one::("map-file").unwrap(); - let output: &String = arguments - .unwrap() - .get_one::("output-directory") - .unwrap(); - - println!("Mapping headers for file: {}", file); - println!("Replace headers with string: {}", map_file); - - match validate_fasta(file) { - Ok(_thing) => { - let new_map: Zip, std::vec::IntoIter> = - pull_map_from_tsv(map_file); - - let new_fasta: String = format!("{output}_OH.fasta"); - - map_headers::mapping_headers::create_mapped_fasta(file, &new_fasta, new_map); - - println!( - "{}\n{}\n\t{}\n", - "FASTA HAS BEEN RE-APPED AND REWRITTEN".green(), - "FOUND HERE:".green(), - &new_fasta.green() - ); - } - Err(_) => { - println!("NOT A VALID FASTA") - } - }; - } -} diff --git a/src/split_by_count.rs b/src/split_by_count.rs deleted file mode 100644 index 1396f00..0000000 --- a/src/split_by_count.rs +++ /dev/null @@ -1,111 +0,0 @@ -pub mod split_by_count_mod { - use crate::generics::sanitise_header; - use clap::ArgMatches; - use compare::{natural, Compare}; - use noodles::fasta::{self, Record}; - use std::cmp::Ordering; - use std::fs::OpenOptions; - use std::{ - fs::{create_dir_all, File}, - io::BufReader, - path::Path, - }; - - #[allow(clippy::needless_return)] - fn fix_head(records: Record, sanitise: bool) -> Record { - if sanitise { - let header = sanitise_header(records.definition()); - let definition = fasta::record::Definition::new(header, None); - let seq = records.sequence().to_owned(); - return fasta::Record::new(definition, seq); - } else { - return records.to_owned(); - }; - } - - fn write_fasta(outdir: &String, fasta_record: &Vec) { - println!("{}", outdir); - - let _data_file = File::create(outdir); - let file = OpenOptions::new() - .append(true) - .open(outdir) - .expect("creation failed"); - - let mut writer = fasta::Writer::new(file); - for i in fasta_record { - writer.write_record(i).unwrap(); - } - } - - pub fn split_file_by_count(arguments: std::option::Option<&ArgMatches>) { - let sanitise: &bool = arguments.unwrap().get_one::("sanitise").unwrap(); - let fasta_file = arguments.unwrap().get_one::("fasta-file").unwrap(); - let path_obj = Path::new(fasta_file); - let grab_name = path_obj.file_name().unwrap(); - let actual_list: Vec<&str> = grab_name.to_str().unwrap().split('.').collect(); - let actual_name = actual_list[0]; - - let data_type = arguments.unwrap().get_one::("data_type").unwrap(); - - let outpath = arguments - .unwrap() - .get_one::("output-directory") - .unwrap(); - - let new_outpath = format!("{}/{}/{}/", outpath, actual_name, data_type); - create_dir_all(new_outpath.clone()).unwrap(); - let fasta_count = arguments.unwrap().get_one::("count").unwrap(); - println!( - "Fasta file for processing: {:?}\nNumber of records per file: {:?}", - fasta_file, fasta_count - ); - - let mut counter: u16 = 0; - let mut file_counter: u16 = 1; - - let file_name: Vec<&str> = actual_name.split('.').collect(); - - let mut reader = File::open(fasta_file) - .map(BufReader::new) - .map(fasta::Reader::new) - .unwrap(); - - let mut record_list: Vec = Vec::new(); - for result in reader.records() { - let record = result.unwrap(); - counter += 1; - - let final_rec = fix_head(record, *sanitise); - record_list.push(final_rec); - - let cmp = natural(); - let compared = cmp.compare(&counter, fasta_count); - if compared == Ordering::Equal { - let full_outpath = format!( - "{}{}_f{}_c{}-a{}.fa", - new_outpath, - file_name[0], - file_counter, - &fasta_count, - &record_list.len() - ); - - write_fasta(&full_outpath, &record_list); - file_counter += 1; - counter = 0; - record_list = Vec::new(); - } - } - - let full_outpath = format!( - "{}{}_f{}_c{}-a{}.fa", - new_outpath, - file_name[0], - file_counter, - &fasta_count, - &record_list.len() - ); - write_fasta(&full_outpath, &record_list); - } -} diff --git a/src/split_by_size.rs b/src/split_by_size.rs deleted file mode 100644 index f1b4a7b..0000000 --- a/src/split_by_size.rs +++ /dev/null @@ -1,12 +0,0 @@ -pub mod split_by_size_mod { - use clap::ArgMatches; - - pub fn split_file_by_size(arguments: std::option::Option<&ArgMatches>) { - let fasta_file: &String = arguments.unwrap().get_one::("fasta-file").unwrap(); - println!("Fasta file for processing: {:?}", &fasta_file); - println!( - "Size to chunk fasta into: {:?}", - arguments.unwrap().get_one::("mem-size").unwrap() - ); - } -} diff --git a/src/tpf_fasta.rs b/src/tpf_fasta.rs deleted file mode 100644 index fc5ec7e..0000000 --- a/src/tpf_fasta.rs +++ /dev/null @@ -1,277 +0,0 @@ -pub mod tpf_fasta_mod { - use clap::ArgMatches; - use noodles::core::Position; - use noodles::fasta; - use noodles::fasta::record::Sequence; - use noodles::fasta::repository::adapters::IndexedReader; - use std::fs::OpenOptions; - use std::io::Write; - use std::{fs::read_to_string, fs::File, str}; - - use crate::generics::validate_fasta; - - #[derive(Debug, Clone, PartialEq, Eq)] - struct Tpf { - ori_scaffold: String, - start_coord: usize, - end_coord: usize, - new_scaffold: String, - orientation: String, - } - - impl std::fmt::Display for Tpf { - fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - write!( - fmt, - "\t{} -- {} -- {}", - self.ori_scaffold, self.start_coord, self.end_coord - ) - } - } - - #[derive(Debug, PartialEq, Eq)] - struct NewFasta { - tpf: Tpf, - sequence: String, - } - - #[derive(Debug)] - struct MyRecord { - name: String, - sequence: Vec, - } - - fn parse_tpf(path: &String) -> Vec { - let mut all_tpf: Vec = Vec::new(); - for line in read_to_string(path).unwrap().lines() { - if line.starts_with('?') { - let line_replaced = line.replace('\t', " "); - let line_list: Vec<&str> = line_replaced.split_whitespace().collect(); - let scaff_data: Vec<&str> = line_list[1].split(':').collect(); - let scaff_coords: Vec<&str> = scaff_data[1].split('-').collect(); - let data = Tpf { - ori_scaffold: scaff_data[0].to_owned(), - start_coord: scaff_coords[0].to_owned().parse::().unwrap(), - end_coord: scaff_coords[1].to_owned().parse::().unwrap(), - new_scaffold: line_list[2].to_owned().replace("RL", "SUPER"), - orientation: line_list[3].to_owned(), - }; - all_tpf.push(data); - } - } - all_tpf - } - - fn subset_vec_tpf<'a>( - tpf: &'a Vec, - fasta: (&std::string::String, &usize), - ) -> Vec<&'a Tpf> { - // - // Subset the Vec based on a search through the fasta - // - let mut subset_tpf: Vec<&Tpf> = Vec::new(); - for i in tpf { - if i.ori_scaffold == *fasta.0 { - subset_tpf.push(i) - } - } - subset_tpf - } - - fn check_orientation( - parsed: std::option::Option, - orientation: String, - ) -> String { - if orientation == "MINUS" { - let start = Position::try_from(1).unwrap(); - let parse_orientation = parsed.unwrap(); - let compliment: Sequence = parse_orientation - .complement() - .collect::>() - .unwrap(); - let seq = compliment.get(start..).unwrap(); - str::from_utf8(seq).unwrap().chars().rev().collect() - } else { - let start = Position::try_from(1).unwrap(); - let parse_orientation = parsed.unwrap(); - let seq = parse_orientation.get(start..).unwrap(); - str::from_utf8(seq).unwrap().chars().collect() - } - } - - fn parse_seq( - sequence: std::option::Option, - tpf: Vec<&Tpf>, - ) -> Vec { - let mut subset_tpf: Vec = Vec::new(); - // - // Take the input sequence and scaffold name - // Parse the input sequence based on the data contained in - // the TPF. Which is already a subset based on scaff name - // - - let new_seq = sequence.unwrap(); // Option(Sequence ()) -> Sequence () - for &i in &tpf { - let start = Position::try_from(i.start_coord).unwrap(); - let end = Position::try_from(i.end_coord).unwrap(); - //let region = Region::new(&i.new_scaffold, start.unwrap()..=end.unwrap()); - let parsed = new_seq.slice(start..=end); - let the_sequence = check_orientation(parsed, i.orientation.to_owned()); - let data = NewFasta { - tpf: i.to_owned(), - sequence: the_sequence, - }; - subset_tpf.push(data); - } - subset_tpf - } - - fn get_uniques(tpf_list: &Vec) -> Vec { - let mut uniques: Vec = Vec::new(); - - for i in tpf_list { - if !uniques.contains(&i.new_scaffold) { - uniques.push(i.new_scaffold.to_owned()) - } - } - uniques - } - - fn save_to_fasta( - fasta_data: Vec, - tpf_data: Vec, - output: &String, - n_length: usize, - ) { - // - // TPF is in the input TPF order, this will continue to be the case until - // the script is modified and the Tpf struct gets modified in place for some reason - // - let _data_file = File::create(output); - let mut file = OpenOptions::new() - .write(true) - .open(output) - .expect("creation failed"); - - let _debugger = File::create("debug.txt"); - let mut file2 = OpenOptions::new() - .write(true) - .open("debug.txt") - .expect("creation failed"); - - let uniques = get_uniques(&tpf_data); - - // This is inefficient as we are scanning through the fasta_data, uniques number of times - // If uniques is 10 long and fasta is 100, then this is 1000 scans through in total. - let mut no_more: Vec = Vec::new(); - for x in uniques { - println!("NOW WRITING DATA FOR: {:?}", &x); - // X = "SUPER_1" - let stringy = format!(">{x}\n"); - file.write_all(stringy.as_bytes()) - .expect("Unable to write to file"); - file2 - .write_all(stringy.as_bytes()) - .expect("Unable to write to file"); - - let mut data: MyRecord = MyRecord { - name: "".to_string(), - sequence: Vec::new(), - }; - - no_more.push(x.to_owned()); - x.clone_into(&mut data.name); - for tpf in &tpf_data { - if tpf.new_scaffold == x { - for fasta in &fasta_data { - if fasta.tpf == *tpf { - let stringy = format!("\t{}\n", tpf); - file2 - .write_all(stringy.as_bytes()) - .expect("Unable to write to file"); - data.sequence.push(fasta.sequence.to_owned()); - } - } - } - } - - let line_len: usize = 60; - let fixed = data.sequence; - let n_string = "N".repeat(n_length); - let fixed2 = fixed.join(&n_string); //.join required a borrowed str - let fixed3 = fixed2 - .as_bytes() - .chunks(line_len) - .map(str::from_utf8) - .collect::, _>>() - .unwrap(); - - for i in fixed3 { - let formatted = i.to_owned() + "\n"; - file.write_all(formatted.as_bytes()).unwrap(); - } - println!("NO LONG SCANNING FOR: {:?}", &no_more) - } - } - - #[allow(clippy::needless_borrow)] - #[allow(clippy::let_and_return)] - pub fn curate_fasta(arguments: std::option::Option<&ArgMatches>) { - // - // Generate a curated fasta file based on the input TPF file - // which was generated by Pretext and the agp_to_tpf script. - // This new fasta file contains a new scaffold naming as well - // as pieced together sequences generated by the splitting of - // data in Pretext. - // - let fasta_file: &String = arguments.unwrap().get_one::("fasta").unwrap(); - let tpf_file: &String = arguments.unwrap().get_one::("tpf").unwrap(); - let n_length: &usize = arguments.unwrap().get_one::("n_length").unwrap(); - let output: &String = arguments.unwrap().get_one::("output").unwrap(); - println!("LET'S GET CURATING THAT FASTA!"); - stacker::maybe_grow(32 * 1024, 1024 * 5120, || { - match validate_fasta(fasta_file) { - Ok(fasta_d) => { - let tpf_data = parse_tpf(&tpf_file); - //let _validated = varify_validity(&tpf_data, &fasta_d); - - // - // Start indexed reader of the input fasta - // if valid then use the data - // - let reader = - fasta::indexed_reader::Builder::default().build_from_path(fasta_file); - let fasta_repo = match reader { - Ok(data) => { - let adapter = IndexedReader::new(data); - let repository = fasta::Repository::new(adapter); - repository - } - Err(_) => todo!(), - }; - - // - // For unique scaffold in the fasta file iter through and - // parse sequence for each line in the tpf - // The tpf will contain multiple enteries for each scaffold, minimum of one entry. - // - let mut new_fasta_data: Vec = Vec::new(); - for i in fasta_d { - let subset_tpf = subset_vec_tpf(&tpf_data, (&i.0, &i.1)); - let sequence = fasta_repo.get(&i.0).transpose(); - - match sequence { - Ok(data) => { - let subset_results = parse_seq(data, subset_tpf); - new_fasta_data.extend(subset_results); - } - Err(e) => panic!("{:?}", e), - }; - } - save_to_fasta(new_fasta_data, tpf_data, output, n_length.to_owned()) - } - Err(e) => panic!("Something is wrong with the file! | {}", e), - } - }) - } -} diff --git a/src/yaml_validator.rs b/src/yaml_validator.rs deleted file mode 100644 index e23d121..0000000 --- a/src/yaml_validator.rs +++ /dev/null @@ -1,272 +0,0 @@ -pub mod yaml_validator_mod { - use clap::ArgMatches; - use colored::Colorize; - use csv::Error; - use csv::ReaderBuilder; - use noodles::fasta; - use serde::{Deserialize, Serialize}; - use std::fs::{self, File}; - use std::io::ErrorKind; - use std::path::PathBuf; - // Would be nice if there was a simple format_check - // use noodles::cram as cram; - - #[derive(Debug, Serialize, Deserialize)] - struct TreeValYaml { - assembly: Assembly, - reference_file: String, - assem_reads: AssemReads, - alignment: Alignment, - self_comp: SelfComp, - intron: Intron, - telomere: Telomere, - synteny: Synteny, - busco: Busco, - } - - #[derive(Debug, Serialize, Deserialize)] - struct Assembly { - level: String, - sample_id: String, - latin_name: String, - classT: String, - asmVersion: u16, - gevalType: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct AssemReads { - pacbio: String, - hic: String, - supplementary: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct Alignment { - data_dir: String, - common_name: String, - geneset: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct SelfComp { - motif_len: u16, - mummer_chunk: u16, - } - - #[derive(Debug, Serialize, Deserialize)] - struct Intron { - size: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct Telomere { - teloseq: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct Synteny { - synteny_genome_path: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct Busco { - lineages_path: String, - lineage: String, - } - - // - // CSV STRUCT - // - //#[derive(Deserialize)] - //struct Record { - // org: String, - // type: String, - // data_file: String - //} - - pub fn validate_paths(path: &str, field_id: &str) { - match fs::metadata(path) { - Ok(_) => { - println!( - "{}{} \t{}\t{}", - ">-".green(), - &field_id.green(), - "| PATH EXISTS: ".green(), - path.green() - ); - match field_id { - "REFERENCE" => validate_fasta(path), - "GENESET-CSV" => { - _ = validate_csv(path); - } - "HIC" => {} - _ => println!("Error"), - } - } - Err(_) => println!( - "{}{} \t{}\t{}", - "<-".red().bold(), - &field_id.red().bold(), - "| CHECK YAML!:".red().bold(), - path - ), - } - } - - pub fn validate_fasta(path: &str) { - let reader = fasta::reader::Builder.build_from_path(path); - - let mut binding = reader.expect("NO VALID HEADER / SEQUENCE PAIRS"); - let result = binding.records(); - let counter = result.count(); - println!( - "{} {} {}", - ">- REFERENCE H/S PAIRS:".green(), - counter, - "H/S PAIRS".green() - ) - } - - pub fn validate_csv(path: &str) -> Result<(), Error> { - let file = File::open(path)?; - - let mut reader = ReaderBuilder::new() - .has_headers(true) - .delimiter(b',') - .from_reader(file); - - let record = reader.records().count(); - println!( - "{} {} {}", - ">-GENESET-RECORD-COUNT: >".green(), - record, - "<".green() - ); - - Ok(()) - } - - // - // FUNCTION: Check if pacbio has fasta.gz files, cram has cram and crai and synteny has fasta - // could make this much easier and consise by passing in a list of file types to check - // validatedata(path, [fa, fna, fasta]) - // - pub fn validate_data(path: &str, dtype: &str) { - match fs::read_dir(path) { - Err(e) if e.kind() == ErrorKind::NotFound => {} - Err(e) => panic!("{} {e}", "<-DIRECTORY PATH DOESN'T EXIST: ".red().bold()), - Ok(data_files) => { - if dtype == "pacbio" { - let files: Vec = data_files - .filter_map(|f| f.ok()) - .filter(|d| match d.path().extension() { - None => false, - Some(ex) => ex == "fasta.gz", - }) - .map(|f| f.path()) - .collect(); - - if files.is_empty() { - println!("{}", "<-NO PACBIO DATA FILES".red()) - } else { - println!("{} {:?}", ">-YOUR FILES ARE:".green(), &files); - } - } else if dtype == "hic" { - let files: Vec = data_files - .filter_map(|f| f.ok()) - .filter(|d| match d.path().extension() { - None => false, - Some(ex) => ex == "cram" || ex == "crai", - }) - .map(|f| f.path()) - .collect(); - - if files.is_empty() { - println!("{}", "<-NO HIC DATA FILES".red()) - } else { - println!("{} {:?}", ">-YOUR FILES ARE:".green(), &files); - } - } else if dtype == "synteny" { - let files: Vec = data_files - .filter_map(|f| f.ok()) - .filter(|d| match d.path().extension() { - None => false, - Some(ex) => ex == "fa" || ex == "fasta" || ex == "fna", - }) - .map(|f| f.path()) - .collect(); - - if files.is_empty() { - println!("{}", "<-NO SYNTENIC GENOMES".red()) - } else { - println!("{} {:?}", ">-YOUR GENOMES ARE:".green(), &files); - } - } - } - }; - } - - pub fn validate_yaml(arguments: std::option::Option<&ArgMatches>) { - let file = arguments.unwrap().get_one::("yaml").unwrap(); - let _output: &String = arguments - .unwrap() - .get_one::("output-directory") - .unwrap(); - let _verbose_flag: &bool = arguments.unwrap().get_one::("verbose").unwrap(); - - println! {"Validating Yaml: {}", file.purple()}; - - let input = fs::File::open(file).expect("Unable to read from file"); - let contents: TreeValYaml = - serde_yaml::from_reader(input).expect("Unable to read from file"); - - println!( - "RUNNING VALIDATE-YAML FOR SAMPLE: {}", - contents.assembly.sample_id.purple() - ); - - validate_paths(&contents.reference_file, "REFERENCE"); - validate_paths(&contents.alignment.data_dir, "GENESET"); - validate_paths(&contents.synteny.synteny_genome_path, "SYNTENY"); - validate_paths(&contents.busco.lineages_path, "BUSCO"); - - validate_paths(&contents.assem_reads.pacbio, "PACBIO"); - validate_data(&contents.assem_reads.pacbio, "pacbio"); - - validate_paths(&contents.assem_reads.hic, "HIC"); - validate_data(&contents.assem_reads.hic, "hic"); - - println!("{}", "CHECKING GENESET DIRECTORY RESOLVES".blue()); - let genesets = contents.alignment.geneset.split(','); - for set in genesets { - let gene_alignment_path = contents.alignment.data_dir.clone() - + &contents.assembly.classT - + "/csv_data/" - + set - + "-data.csv"; - validate_paths(&gene_alignment_path, "GENESET-CSV"); - } - - println!("{}", "CHECKING SYNTENY DIRECTORY RESOLVES".blue()); - let synteny_full = - contents.synteny.synteny_genome_path.clone() + &contents.assembly.classT + "/"; - validate_paths(&synteny_full, "SYNTENY-FASTA"); - validate_data(&synteny_full, "synteny"); - - println!("{}", "CHECKING BUSCO DIRECTORY RESOLVES".blue()); - let busco_path = - contents.busco.lineages_path.clone() + "/lineages/" + &contents.busco.lineage; - validate_paths(&busco_path, "BUSCO-DB"); - // NOW CHECK FOR FILES IN DIRECTORY? - - println!( - "{}\n{}\n{}\n{}\n{}", - "VALIDATION COMPLETE".purple().bold(), - "GENERAL INFORMATION:".purple().bold(), - "Check the log to see what failed".bold(), - "FULL : ONLY synteny fails are permitted".purple(), - "RAPID: geneset, busco and synteny fails are permitted".purple() - ); - } -} From 6baebd18aeeae0d91aa154a996389ac4ab44c38a Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 24 May 2024 17:27:47 +0100 Subject: [PATCH 02/30] Adding unit test structure --- src/processors/exclude_seq.rs | 11 +++++++++++ src/processors/map_headers.rs | 11 +++++++++++ src/processors/remap_head.rs | 10 ++++++++++ src/processors/split_by_count.rs | 11 +++++++++++ src/processors/split_by_size.rs | 10 ++++++++++ src/processors/tpf_fasta.rs | 11 +++++++++++ src/processors/yaml_validator.rs | 11 +++++++++++ 7 files changed, 75 insertions(+) diff --git a/src/processors/exclude_seq.rs b/src/processors/exclude_seq.rs index 89ecc8d..84251a2 100644 --- a/src/processors/exclude_seq.rs +++ b/src/processors/exclude_seq.rs @@ -38,3 +38,14 @@ pub fn filter_fasta(fasta: &String, outfile: &String, exclude: &String) { let _x = open_fasta(list_to_exclude, fasta, outfile); } +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} + diff --git a/src/processors/map_headers.rs b/src/processors/map_headers.rs index 18a9847..3fa1101 100644 --- a/src/processors/map_headers.rs +++ b/src/processors/map_headers.rs @@ -129,3 +129,14 @@ pub fn map_fasta_head( Ok(()) } +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} + diff --git a/src/processors/remap_head.rs b/src/processors/remap_head.rs index 44ef5d5..c4f53ae 100644 --- a/src/processors/remap_head.rs +++ b/src/processors/remap_head.rs @@ -64,3 +64,13 @@ pub fn remapping_head(file: &String, output: &String, map_file: &String) { }; } +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} diff --git a/src/processors/split_by_count.rs b/src/processors/split_by_count.rs index dda0b39..a220b14 100644 --- a/src/processors/split_by_count.rs +++ b/src/processors/split_by_count.rs @@ -98,3 +98,14 @@ pub fn split_file_by_count(fasta_file: &String, output_directory: &String, data_ write_fasta(&full_outpath, &record_list); } +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} + diff --git a/src/processors/split_by_size.rs b/src/processors/split_by_size.rs index 5a65719..c6452dd 100644 --- a/src/processors/split_by_size.rs +++ b/src/processors/split_by_size.rs @@ -8,3 +8,13 @@ pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, output_directory: ); } +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} diff --git a/src/processors/tpf_fasta.rs b/src/processors/tpf_fasta.rs index f8863b3..9052b6f 100644 --- a/src/processors/tpf_fasta.rs +++ b/src/processors/tpf_fasta.rs @@ -270,3 +270,14 @@ pub fn curate_fasta(fasta_file: &String, tpf_file: &String, sort: &bool, output: }) } +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} + diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs index d2a08f4..a738f2e 100644 --- a/src/processors/yaml_validator.rs +++ b/src/processors/yaml_validator.rs @@ -263,3 +263,14 @@ pub fn validate_yaml(file: &String, verbose: &bool, output: &String) { "RAPID: geneset, busco and synteny fails are permitted".purple() ); } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} From dd524f6b593ee71ef765f75c113d65d7735d701e Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 24 May 2024 17:32:42 +0100 Subject: [PATCH 03/30] Cargo format --- src/cli/mod.rs | 26 +++++----- src/main.rs | 86 +++++++++++++++++++------------- src/processors.rs | 8 +-- src/processors/exclude_seq.rs | 1 - src/processors/map_headers.rs | 16 ++---- src/processors/remap_head.rs | 1 - src/processors/split_by_count.rs | 9 +++- src/processors/split_by_size.rs | 5 +- src/processors/tpf_fasta.rs | 26 ++++------ src/processors/yaml_validator.rs | 7 +-- 10 files changed, 94 insertions(+), 91 deletions(-) diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 74d50ef..80dc8e5 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -1,5 +1,5 @@ -use clap::{Parser, Subcommand}; use clap::builder::Str; +use clap::{Parser, Subcommand}; const SPLIT_OPTIONS: [&str; 5] = ["pep", "cds", "cdna", "rna", "other"]; @@ -10,7 +10,7 @@ pub struct Cli { // command is optional (TODO: Make this not optional) // Reference: https://docs.rs/clap/latest/clap/_derive/_tutorial/chapter_2/index.html#defaults #[command(subcommand)] - pub command: Option + pub command: Option, } // Reference: https://docs.rs/clap/latest/clap/_derive/_tutorial/chapter_2/index.html @@ -27,11 +27,10 @@ pub enum Commands { // Output the log to file #[arg(short = 'o', long, default_value_t=String::from("./"))] - output: String + output: String, }, SplitByCount { - // A path to a valid fasta file. #[arg(short = 'f', long)] fasta_file: String, @@ -42,7 +41,7 @@ pub enum Commands { // The data type of the input data #[arg(short = 'd', value_parser = clap::builder::PossibleValuesParser::new(SPLIT_OPTIONS))] - data_type: String , + data_type: String, // Do we need to sanitise the headers of the input fasta #[arg(short = 's', value_parser = clap::value_parser!(bool))] @@ -87,7 +86,7 @@ pub enum Commands { output_directory: String, #[arg(short = 'r', default_value_t = String::from("FMMH"))] - replace_with: String + replace_with: String, }, ReMapHeaders { @@ -101,7 +100,7 @@ pub enum Commands { // "The original mapped header field, a TSV of old-header, new-header #[arg(short = 'm', default_value_t = String::from("FMMH"))] - map_file: String + map_file: String, }, #[command(version, about="Profile an input fasta file and return various statistics", long_about = None)] @@ -112,7 +111,7 @@ pub enum Commands { // The input fasta file for profiling #[arg(short = 'o', long, default_value_t = String::from("FasMan-out"))] - output_dir: String + output_dir: String, }, Curate { @@ -133,7 +132,7 @@ pub enum Commands { // Length that the N (gap) string should be. #[arg(short, long, default_value_t = 200)] - n_length: usize + n_length: usize, }, Subset { @@ -147,7 +146,7 @@ pub enum Commands { // Percentage of the original file entries that should be retained #[arg(short = 'p', long, default_value_t = 50)] - percent: u16 + percent: u16, }, FilterFasta { @@ -160,11 +159,10 @@ pub enum Commands { output: String, #[arg(short = 'l', long = "filter_list")] - filter_list: String + filter_list: String, }, Mergehaps { - // The input fasta file for re-organising #[arg(short = 'p', long)] fasta_1: String, @@ -180,5 +178,5 @@ pub enum Commands { // Output file prefix #[arg(short = 'o', default_value_t = String::from("merged"))] output: String, - } -} \ No newline at end of file + }, +} diff --git a/src/main.rs b/src/main.rs index 0f635ac..a450cc0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,52 +12,70 @@ mod generics; //use crate::generics::validate_fasta; // Reference: https://doc.rust-lang.org/book/ch07-02-defining-modules-to-control-scope-and-privacy.html -use crate::processors::yaml_validator::validate_yaml; -use crate::processors::split_by_count::split_file_by_count; -use crate::processors::split_by_size::split_file_by_size; +use crate::processors::exclude_seq::filter_fasta; use crate::processors::map_headers::map_fasta_head; use crate::processors::remap_head::remapping_head; +use crate::processors::split_by_count::split_file_by_count; +use crate::processors::split_by_size::split_file_by_size; use crate::processors::tpf_fasta::curate_fasta; -use crate::processors::exclude_seq::filter_fasta; +use crate::processors::yaml_validator::validate_yaml; mod processors; - fn main() -> Result<(), Error> { - let cli = Cli::parse(); match &cli.command { - Some(Commands::YamlValidator { yaml, verbose, output }) => { - validate_yaml(yaml, verbose, output) - }, - Some(Commands::SplitByCount { fasta_file, output_directory, data_type, sanitise, count}) => { - split_file_by_count( - fasta_file, output_directory, data_type, sanitise, count - ) - }, - Some(Commands::SplitBySize { fasta_file, mem_size, output_directory }) => { - split_file_by_size( - fasta_file, - mem_size, - output_directory - ) - }, - Some(Commands::MapHeaders { fasta_file, output_directory, replace_with }) => { - _ = map_fasta_head(fasta_file, output_directory, replace_with) - }, - Some(Commands::ReMapHeaders { fasta_file, output_directory, map_file }) => { - remapping_head(fasta_file, output_directory, map_file) + Some(Commands::YamlValidator { + yaml, + verbose, + output, + }) => validate_yaml(yaml, verbose, output), + Some(Commands::SplitByCount { + fasta_file, + output_directory, + data_type, + sanitise, + count, + }) => split_file_by_count(fasta_file, output_directory, data_type, sanitise, count), + Some(Commands::SplitBySize { + fasta_file, + mem_size, + output_directory, + }) => split_file_by_size(fasta_file, mem_size, output_directory), + Some(Commands::MapHeaders { + fasta_file, + output_directory, + replace_with, + }) => _ = map_fasta_head(fasta_file, output_directory, replace_with), + Some(Commands::ReMapHeaders { + fasta_file, + output_directory, + map_file, + }) => remapping_head(fasta_file, output_directory, map_file), + Some(Commands::Curate { + fasta, + tpf, + sort, + output, + n_length, + }) => curate_fasta(fasta, tpf, sort, output, n_length), + Some(Commands::FilterFasta { + fasta, + output, + filter_list, + }) => filter_fasta(fasta, output, filter_list), + Some(Commands::GenesetCSVS { .. }) => { + todo!() + } + Some(Commands::Profile { .. }) => { + todo!() } - Some(Commands::Curate { fasta, tpf, sort, output, n_length}) => { - curate_fasta(fasta, tpf, sort, output, n_length) + Some(Commands::Subset { .. }) => { + todo!() } - Some(Commands::FilterFasta { fasta, output, filter_list }) => { - filter_fasta(fasta, output, filter_list) + Some(Commands::Mergehaps { .. }) => { + todo!() } - Some(Commands::GenesetCSVS { .. }) => { todo!() }, - Some(Commands::Profile { .. }) => { todo!() } - Some(Commands::Subset { .. }) => { todo!() } - Some(Commands::Mergehaps { .. }) => { todo!() } None => { panic!("No command given!") } diff --git a/src/processors.rs b/src/processors.rs index 0c1ad7f..dc0f572 100644 --- a/src/processors.rs +++ b/src/processors.rs @@ -1,7 +1,7 @@ -pub mod yaml_validator; -pub mod split_by_count; -pub mod split_by_size; +pub mod exclude_seq; pub mod map_headers; pub mod remap_head; +pub mod split_by_count; +pub mod split_by_size; pub mod tpf_fasta; -pub mod exclude_seq; +pub mod yaml_validator; diff --git a/src/processors/exclude_seq.rs b/src/processors/exclude_seq.rs index 84251a2..6e04222 100644 --- a/src/processors/exclude_seq.rs +++ b/src/processors/exclude_seq.rs @@ -48,4 +48,3 @@ mod tests { assert_eq!(result, 4); } } - diff --git a/src/processors/map_headers.rs b/src/processors/map_headers.rs index 3fa1101..8cf09dd 100644 --- a/src/processors/map_headers.rs +++ b/src/processors/map_headers.rs @@ -44,10 +44,7 @@ pub fn create_mapping( pub fn save_mapping( output: &str, - mapped: Zip< - std::vec::IntoIter, - std::vec::IntoIter, - >, + mapped: Zip, std::vec::IntoIter>, ) { let f: File = File::create(output).expect("Unable to create file"); let mut f: BufWriter = BufWriter::new(f); @@ -62,10 +59,7 @@ pub fn save_mapping( pub fn create_mapped_fasta( input: &str, output: &str, - mapped: Zip< - std::vec::IntoIter, - std::vec::IntoIter, - >, + mapped: Zip, std::vec::IntoIter>, ) { let file_reader: File = File::open(input).expect("CAN'T OPEN FILE"); let buff_reader: BufReader = BufReader::new(file_reader); @@ -91,9 +85,10 @@ pub fn create_mapped_fasta( } pub fn map_fasta_head( - file: &String, output: &String, replacer: &String + file: &String, + output: &String, + replacer: &String, ) -> Result<(), Box> { - println!("Mapping headers for file: {}", file); println!("Replace headers with string: {:?}", &replacer); @@ -139,4 +134,3 @@ mod tests { assert_eq!(result, 4); } } - diff --git a/src/processors/remap_head.rs b/src/processors/remap_head.rs index c4f53ae..02cc633 100644 --- a/src/processors/remap_head.rs +++ b/src/processors/remap_head.rs @@ -38,7 +38,6 @@ pub fn pull_map_from_tsv( } pub fn remapping_head(file: &String, output: &String, map_file: &String) { - println!("Mapping headers for file: {}", file); println!("Replace headers with string: {}", map_file); diff --git a/src/processors/split_by_count.rs b/src/processors/split_by_count.rs index a220b14..43bfa53 100644 --- a/src/processors/split_by_count.rs +++ b/src/processors/split_by_count.rs @@ -37,7 +37,13 @@ fn write_fasta(outdir: &String, fasta_record: &Vec) { } } -pub fn split_file_by_count(fasta_file: &String, output_directory: &String, data_type: &String, sanitise: &bool, fasta_count: &u16) { +pub fn split_file_by_count( + fasta_file: &String, + output_directory: &String, + data_type: &String, + sanitise: &bool, + fasta_count: &u16, +) { let path_obj = Path::new(fasta_file); let grab_name = path_obj.file_name().unwrap(); let actual_list: Vec<&str> = grab_name.to_str().unwrap().split('.').collect(); @@ -108,4 +114,3 @@ mod tests { assert_eq!(result, 4); } } - diff --git a/src/processors/split_by_size.rs b/src/processors/split_by_size.rs index c6452dd..1bf3e27 100644 --- a/src/processors/split_by_size.rs +++ b/src/processors/split_by_size.rs @@ -2,10 +2,7 @@ use clap::ArgMatches; pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, output_directory: &String) { println!("Fasta file for processing: {:?}", &fasta_file); - println!( - "Size to chunk fasta into: {:?}", - mem_size - ); + println!("Size to chunk fasta into: {:?}", mem_size); } #[cfg(test)] diff --git a/src/processors/tpf_fasta.rs b/src/processors/tpf_fasta.rs index 9052b6f..e8c292b 100644 --- a/src/processors/tpf_fasta.rs +++ b/src/processors/tpf_fasta.rs @@ -1,6 +1,6 @@ -use std::{fs::File, fs::read_to_string, str}; use std::fs::OpenOptions; use std::io::Write; +use std::{fs::read_to_string, fs::File, str}; use noodles::core::Position; use noodles::fasta; @@ -61,10 +61,7 @@ fn parse_tpf(path: &String) -> Vec { all_tpf } -fn subset_vec_tpf<'a>( - tpf: &'a Vec, - fasta: (&std::string::String, &usize), -) -> Vec<&'a Tpf> { +fn subset_vec_tpf<'a>(tpf: &'a Vec, fasta: (&std::string::String, &usize)) -> Vec<&'a Tpf> { // // Subset the Vec based on a search through the fasta // @@ -136,12 +133,7 @@ fn get_uniques(tpf_list: &Vec) -> Vec { uniques } -fn save_to_fasta( - fasta_data: Vec, - tpf_data: Vec, - output: &String, - n_length: usize, -) { +fn save_to_fasta(fasta_data: Vec, tpf_data: Vec, output: &String, n_length: usize) { // // TPF is in the input TPF order, this will continue to be the case until // the script is modified and the Tpf struct gets modified in place for some reason @@ -215,7 +207,13 @@ fn save_to_fasta( #[allow(clippy::needless_borrow)] #[allow(clippy::let_and_return)] -pub fn curate_fasta(fasta_file: &String, tpf_file: &String, sort: &bool, output: &String, n_length: &usize) { +pub fn curate_fasta( + fasta_file: &String, + tpf_file: &String, + sort: &bool, + output: &String, + n_length: &usize, +) { // // Generate a curated fasta file based on the input TPF file // which was generated by Pretext and the agp_to_tpf script. @@ -234,8 +232,7 @@ pub fn curate_fasta(fasta_file: &String, tpf_file: &String, sort: &bool, output: // Start indexed reader of the input fasta // if valid then use the data // - let reader = - fasta::indexed_reader::Builder::default().build_from_path(fasta_file); + let reader = fasta::indexed_reader::Builder::default().build_from_path(fasta_file); let fasta_repo = match reader { Ok(data) => { let adapter = IndexedReader::new(data); @@ -280,4 +277,3 @@ mod tests { assert_eq!(result, 4); } } - diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs index a738f2e..bf3abac 100644 --- a/src/processors/yaml_validator.rs +++ b/src/processors/yaml_validator.rs @@ -208,12 +208,10 @@ pub fn validate_data(path: &str, dtype: &str) { } pub fn validate_yaml(file: &String, verbose: &bool, output: &String) { - println! {"Validating Yaml: {}", file.purple()}; let input = fs::File::open(file).expect("Unable to read from file"); - let contents: TreeValYaml = - serde_yaml::from_reader(input).expect("Unable to read from file"); + let contents: TreeValYaml = serde_yaml::from_reader(input).expect("Unable to read from file"); println!( "RUNNING VALIDATE-YAML FOR SAMPLE: {}", @@ -249,8 +247,7 @@ pub fn validate_yaml(file: &String, verbose: &bool, output: &String) { validate_data(&synteny_full, "synteny"); println!("{}", "CHECKING BUSCO DIRECTORY RESOLVES".blue()); - let busco_path = - contents.busco.lineages_path.clone() + "/lineages/" + &contents.busco.lineage; + let busco_path = contents.busco.lineages_path.clone() + "/lineages/" + &contents.busco.lineage; validate_paths(&busco_path, "BUSCO-DB"); // NOW CHECK FOR FILES IN DIRECTORY? From d8f1b33db7aff2eb60370c3e91d8255e3331dc85 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 24 May 2024 17:35:58 +0100 Subject: [PATCH 04/30] Clippy fixes --- src/cli/mod.rs | 2 +- src/main.rs | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 80dc8e5..e89e328 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -1,5 +1,5 @@ -use clap::builder::Str; use clap::{Parser, Subcommand}; +use clap::builder::Str; const SPLIT_OPTIONS: [&str; 5] = ["pep", "cds", "cdna", "rna", "other"]; diff --git a/src/main.rs b/src/main.rs index a450cc0..bad5f7f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,15 +1,11 @@ #![allow(non_snake_case)] -use clap::{command, Arg, Command, Parser}; -use colored::Colorize; -use std::env; use std::io::Error; -mod cli; -use cli::{Cli, Commands}; +use clap::Parser; +use colored::Colorize; -mod generics; -//use crate::generics::validate_fasta; +use cli::{Cli, Commands}; // Reference: https://doc.rust-lang.org/book/ch07-02-defining-modules-to-control-scope-and-privacy.html use crate::processors::exclude_seq::filter_fasta; @@ -19,6 +15,11 @@ use crate::processors::split_by_count::split_file_by_count; use crate::processors::split_by_size::split_file_by_size; use crate::processors::tpf_fasta::curate_fasta; use crate::processors::yaml_validator::validate_yaml; + +mod cli; +mod generics; +//use crate::generics::validate_fasta; + mod processors; fn main() -> Result<(), Error> { From ae55838106e3c51727d9edbb7ef00496393c5d44 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 24 May 2024 17:36:21 +0100 Subject: [PATCH 05/30] Clippy fixes --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index ea8c4bf..2a0038a 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +.idea \ No newline at end of file From 06129eadbfd2f4da72e3a3233d505a5252a2d34a Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 24 May 2024 17:37:39 +0100 Subject: [PATCH 06/30] Clippy fixes --- src/cli/mod.rs | 1 - src/main.rs | 1 - src/processors/exclude_seq.rs | 2 -- src/processors/map_headers.rs | 2 -- src/processors/remap_head.rs | 1 - src/processors/split_by_count.rs | 3 +-- src/processors/split_by_size.rs | 5 +---- src/processors/tpf_fasta.rs | 3 +-- src/processors/yaml_validator.rs | 3 +-- 9 files changed, 4 insertions(+), 17 deletions(-) diff --git a/src/cli/mod.rs b/src/cli/mod.rs index e89e328..2638ee9 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -1,5 +1,4 @@ use clap::{Parser, Subcommand}; -use clap::builder::Str; const SPLIT_OPTIONS: [&str; 5] = ["pep", "cds", "cdna", "rna", "other"]; diff --git a/src/main.rs b/src/main.rs index bad5f7f..d571da8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,7 +3,6 @@ use std::io::Error; use clap::Parser; -use colored::Colorize; use cli::{Cli, Commands}; diff --git a/src/processors/exclude_seq.rs b/src/processors/exclude_seq.rs index 6e04222..7866df4 100644 --- a/src/processors/exclude_seq.rs +++ b/src/processors/exclude_seq.rs @@ -1,4 +1,3 @@ -use clap::ArgMatches; use noodles::fasta; use std::error::Error; use std::{fs, io::BufRead, str}; @@ -40,7 +39,6 @@ pub fn filter_fasta(fasta: &String, outfile: &String, exclude: &String) { #[cfg(test)] mod tests { - use super::*; #[test] fn it_works() { diff --git a/src/processors/map_headers.rs b/src/processors/map_headers.rs index 8cf09dd..9c3390f 100644 --- a/src/processors/map_headers.rs +++ b/src/processors/map_headers.rs @@ -1,4 +1,3 @@ -use clap::ArgMatches; use colored::Colorize; use std::error::Error; use std::fmt; @@ -126,7 +125,6 @@ pub fn map_fasta_head( #[cfg(test)] mod tests { - use super::*; #[test] fn it_works() { diff --git a/src/processors/remap_head.rs b/src/processors/remap_head.rs index 02cc633..ca30d10 100644 --- a/src/processors/remap_head.rs +++ b/src/processors/remap_head.rs @@ -65,7 +65,6 @@ pub fn remapping_head(file: &String, output: &String, map_file: &String) { #[cfg(test)] mod tests { - use super::*; #[test] fn it_works() { diff --git a/src/processors/split_by_count.rs b/src/processors/split_by_count.rs index 43bfa53..35ba1fe 100644 --- a/src/processors/split_by_count.rs +++ b/src/processors/split_by_count.rs @@ -1,5 +1,5 @@ use crate::generics::sanitise_header; -use clap::ArgMatches; + use compare::{natural, Compare}; use noodles::fasta::{self, Record}; use std::cmp::Ordering; @@ -106,7 +106,6 @@ pub fn split_file_by_count( #[cfg(test)] mod tests { - use super::*; #[test] fn it_works() { diff --git a/src/processors/split_by_size.rs b/src/processors/split_by_size.rs index 1bf3e27..8c126dc 100644 --- a/src/processors/split_by_size.rs +++ b/src/processors/split_by_size.rs @@ -1,13 +1,10 @@ -use clap::ArgMatches; - -pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, output_directory: &String) { +pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, _output_directory: &String) { println!("Fasta file for processing: {:?}", &fasta_file); println!("Size to chunk fasta into: {:?}", mem_size); } #[cfg(test)] mod tests { - use super::*; #[test] fn it_works() { diff --git a/src/processors/tpf_fasta.rs b/src/processors/tpf_fasta.rs index e8c292b..0453798 100644 --- a/src/processors/tpf_fasta.rs +++ b/src/processors/tpf_fasta.rs @@ -210,7 +210,7 @@ fn save_to_fasta(fasta_data: Vec, tpf_data: Vec, output: &String, pub fn curate_fasta( fasta_file: &String, tpf_file: &String, - sort: &bool, + _sort: &bool, output: &String, n_length: &usize, ) { @@ -269,7 +269,6 @@ pub fn curate_fasta( #[cfg(test)] mod tests { - use super::*; #[test] fn it_works() { diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs index bf3abac..0661e45 100644 --- a/src/processors/yaml_validator.rs +++ b/src/processors/yaml_validator.rs @@ -207,7 +207,7 @@ pub fn validate_data(path: &str, dtype: &str) { }; } -pub fn validate_yaml(file: &String, verbose: &bool, output: &String) { +pub fn validate_yaml(file: &String, _verbose: &bool, _output: &String) { println! {"Validating Yaml: {}", file.purple()}; let input = fs::File::open(file).expect("Unable to read from file"); @@ -263,7 +263,6 @@ pub fn validate_yaml(file: &String, verbose: &bool, output: &String) { #[cfg(test)] mod tests { - use super::*; #[test] fn it_works() { From 011b5394033c5b309ee699816b757d5a5cd77569 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 24 May 2024 17:40:16 +0100 Subject: [PATCH 07/30] Clippy fixes --- src/processors/exclude_seq.rs | 2 +- src/processors/split_by_size.rs | 2 +- src/processors/yaml_validator.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/processors/exclude_seq.rs b/src/processors/exclude_seq.rs index 7866df4..1b0b8c1 100644 --- a/src/processors/exclude_seq.rs +++ b/src/processors/exclude_seq.rs @@ -32,7 +32,7 @@ fn open_fasta<'a>( } } -pub fn filter_fasta(fasta: &String, outfile: &String, exclude: &String) { +pub fn filter_fasta(fasta: &str, outfile: &str, exclude: &str) { let list_to_exclude = exclude.split(',').collect::>(); let _x = open_fasta(list_to_exclude, fasta, outfile); } diff --git a/src/processors/split_by_size.rs b/src/processors/split_by_size.rs index 8c126dc..6445afd 100644 --- a/src/processors/split_by_size.rs +++ b/src/processors/split_by_size.rs @@ -1,4 +1,4 @@ -pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, _output_directory: &String) { +pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, _output_directory: &str) { println!("Fasta file for processing: {:?}", &fasta_file); println!("Size to chunk fasta into: {:?}", mem_size); } diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs index 0661e45..f7a5172 100644 --- a/src/processors/yaml_validator.rs +++ b/src/processors/yaml_validator.rs @@ -207,7 +207,7 @@ pub fn validate_data(path: &str, dtype: &str) { }; } -pub fn validate_yaml(file: &String, _verbose: &bool, _output: &String) { +pub fn validate_yaml(file: &String, _verbose: &bool, _output: &str) { println! {"Validating Yaml: {}", file.purple()}; let input = fs::File::open(file).expect("Unable to read from file"); From c5b4ef97926460c618ae59ab4ae2fd7067f4e729 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 24 May 2024 17:43:12 +0100 Subject: [PATCH 08/30] Readme update (badge added) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 79c7001..6c478a2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # FastaManipulator +![img](https://github.com/Rust-Wellcome/FasMan/actions/workflows/release-repo.yml/badge.svg) + This is a re-write of the current fasta manipulation scripts I've written whilst at ToL, as well as adding some functionality needed for future projects. Currently, this program has the following arguments: From bdf8cd2442a4982cc51e59cc63909ceaa20f3c07 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sat, 25 May 2024 19:29:13 +0100 Subject: [PATCH 09/30] Updating module structure. --- src/{processors.rs => processors/mod.rs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{processors.rs => processors/mod.rs} (100%) diff --git a/src/processors.rs b/src/processors/mod.rs similarity index 100% rename from src/processors.rs rename to src/processors/mod.rs From b1423b0f8e57d13a8a6971eea8038535bc78b08d Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sat, 25 May 2024 23:31:11 +0100 Subject: [PATCH 10/30] Adding file read func --- src/file_utils/file_utility.rs | 67 ++++++++++++++++++++++++++++++++++ src/file_utils/mod.rs | 1 + src/main.rs | 1 + 3 files changed, 69 insertions(+) create mode 100644 src/file_utils/file_utility.rs create mode 100644 src/file_utils/mod.rs diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs new file mode 100644 index 0000000..4cc5424 --- /dev/null +++ b/src/file_utils/file_utility.rs @@ -0,0 +1,67 @@ +use std::fs::File; +use std::io::{BufRead, BufReader}; + +use clap::Error; + +struct Records { + lines: Vec +} + +struct FileReader { + buffer: Vec // TODO: Make use of this internal buffer. +} + +pub trait Default { + fn default() -> Self; +} + +impl Default for FileReader { + fn default() -> Self { + FileReader { + buffer: Vec::::new() + } + } +} + +impl FileReader { + + /* + * Reads a specific number of lines from a file + */ + pub fn read_file(&mut self, file_path: &str, num_lines: usize) -> Result { + let file = File::open(file_path)?; + let reader = BufReader::new(file); + // This buffer will be stored in heap, and will popped off when read_file function goes out of scope. + let mut internal_buffer = vec![]; + + // Error unwrapping: https://tinyurl.com/brt9fphk + // take() function https://tinyurl.com/6vx7m3k6 + for line in reader.lines().take(num_lines) { + let result = line.expect("Error in reading file"); // This will panic if errored + internal_buffer.push(result); + }; + + Ok(Records { lines: internal_buffer }) + } + +} + +#[cfg(test)] +mod tests { + use core::panic; + + use super::*; + + #[test] + fn read_first_line() { + let mut fileReader = FileReader::default(); + match fileReader.read_file("test_data/synthetic/tiny.fa", 3) { + Ok(records) => { + assert_eq!(3, records.lines.len()) + } + Err(error) => { + panic!("{:?}", error) + } + } + } +} \ No newline at end of file diff --git a/src/file_utils/mod.rs b/src/file_utils/mod.rs new file mode 100644 index 0000000..6f2644c --- /dev/null +++ b/src/file_utils/mod.rs @@ -0,0 +1 @@ +pub mod file_utility; \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index d571da8..01e7267 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,6 +15,7 @@ use crate::processors::split_by_size::split_file_by_size; use crate::processors::tpf_fasta::curate_fasta; use crate::processors::yaml_validator::validate_yaml; +mod file_utils; mod cli; mod generics; //use crate::generics::validate_fasta; From 4cb033a9e69cc49a45334ee12e82315c4131bfc4 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sat, 25 May 2024 23:40:07 +0100 Subject: [PATCH 11/30] Adding start and end pointers --- src/file_utils/file_utility.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index 4cc5424..a7062e5 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -3,14 +3,19 @@ use std::io::{BufRead, BufReader}; use clap::Error; +#[allow(dead_code)] struct Records { lines: Vec } +#[allow(dead_code)] struct FileReader { - buffer: Vec // TODO: Make use of this internal buffer. + buffer: Vec, // TODO: Make use of this internal buffer. + startPtr: u16, // TODO: Use these pointers to read data chunks + endPtr: u16, } +#[allow(dead_code)] pub trait Default { fn default() -> Self; } @@ -18,7 +23,9 @@ pub trait Default { impl Default for FileReader { fn default() -> Self { FileReader { - buffer: Vec::::new() + buffer: Vec::::new(), + startPtr: 0, + endPtr: 0, } } } From f8ffdba436ac05426a70515ebf7c4b6d63370401 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sat, 25 May 2024 23:42:18 +0100 Subject: [PATCH 12/30] TODO docs --- src/file_utils/file_utility.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index a7062e5..f5d1207 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -34,6 +34,7 @@ impl FileReader { /* * Reads a specific number of lines from a file + * TODO: Propogate errors: https://doc.rust-lang.org/book/ch09-02-recoverable-errors-with-result.html#propagating-errors */ pub fn read_file(&mut self, file_path: &str, num_lines: usize) -> Result { let file = File::open(file_path)?; From dcc4ed28419aff2b75f425f193b5b59677b99023 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sat, 25 May 2024 23:43:33 +0100 Subject: [PATCH 13/30] Ptr unit change --- src/file_utils/file_utility.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index f5d1207..9ba1627 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -11,8 +11,8 @@ struct Records { #[allow(dead_code)] struct FileReader { buffer: Vec, // TODO: Make use of this internal buffer. - startPtr: u16, // TODO: Use these pointers to read data chunks - endPtr: u16, + startPtr: usize, // TODO: Use these pointers to read data chunks + endPtr: usize, } #[allow(dead_code)] From ddc8414ba4a413a17ebdebd62e68ddbed40a0a04 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sun, 26 May 2024 10:58:16 +0100 Subject: [PATCH 14/30] Implementing batch-wise file-readl --- Cargo.lock | 16 ++++++ Cargo.toml | 1 + src/file_utils/file_utility.rs | 101 +++++++++++++++++++++------------ src/file_utils/mod.rs | 2 +- src/main.rs | 2 +- 5 files changed, 84 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a09d981..b1679d9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -375,6 +375,12 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "either" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" + [[package]] name = "enum-iterator" version = "1.5.0" @@ -431,6 +437,7 @@ dependencies = [ "compare", "csv", "io", + "itertools", "noodles", "regex", "serde", @@ -710,6 +717,15 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.9" diff --git a/Cargo.toml b/Cargo.toml index b6f6872..2cda670 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ colored = "2.0.4" compare = "0.1.0" csv = "1.3.0" io = "0.0.2" +itertools = "0.13.0" noodles = { version = "0.52.0", features = ["fasta", "cram", "csi", "core"] } regex = "1.9.5" serde = { version = "1.0.188", features = ["derive"] } diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index 9ba1627..cacd85d 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -2,74 +2,103 @@ use std::fs::File; use std::io::{BufRead, BufReader}; use clap::Error; +use itertools::Itertools; #[allow(dead_code)] -struct Records { - lines: Vec +struct Records { + lines: Vec, } -#[allow(dead_code)] -struct FileReader { - buffer: Vec, // TODO: Make use of this internal buffer. - startPtr: usize, // TODO: Use these pointers to read data chunks - endPtr: usize, +impl Records { + pub fn size(&self) -> usize { + self.lines.len() + } } #[allow(dead_code)] -pub trait Default { +struct BatchFileReader {} + +#[allow(dead_code)] +pub trait DefaultReader { fn default() -> Self; } -impl Default for FileReader { +impl DefaultReader for BatchFileReader { fn default() -> Self { - FileReader { - buffer: Vec::::new(), - startPtr: 0, - endPtr: 0, - } + BatchFileReader {} } } -impl FileReader { - +impl BatchFileReader { /* - * Reads a specific number of lines from a file - * TODO: Propogate errors: https://doc.rust-lang.org/book/ch09-02-recoverable-errors-with-result.html#propagating-errors + * Reads a specific number of lines from a file from the top */ - pub fn read_file(&mut self, file_path: &str, num_lines: usize) -> Result { + pub fn read_lines( + &mut self, + file_path: &str, + num_lines: usize, + ) -> Result, Error> { let file = File::open(file_path)?; let reader = BufReader::new(file); - // This buffer will be stored in heap, and will popped off when read_file function goes out of scope. - let mut internal_buffer = vec![]; + let mut internal_buffer = Vec::::new(); // Error unwrapping: https://tinyurl.com/brt9fphk // take() function https://tinyurl.com/6vx7m3k6 for line in reader.lines().take(num_lines) { let result = line.expect("Error in reading file"); // This will panic if errored - internal_buffer.push(result); - }; + internal_buffer.push(result.clone()) + } - Ok(Records { lines: internal_buffer }) + Ok(Records { + lines: internal_buffer, + }) } + /** + * Reads a file batch by batch, and applies a function Fn for each chunk + */ + pub fn read_file_by_batch( + &mut self, + file_path: &str, + batch_size: usize, + f: &dyn Fn(Records) -> (), + ) -> Result<(), Error> { + let file = File::open(file_path)?; + let reader = BufReader::new(file); + + for chunk in &reader.lines().map_while(Result::ok).chunks(batch_size) { + f(Records { + lines: chunk.collect(), + }); + } + + Ok(()) + } } #[cfg(test)] mod tests { - use core::panic; use super::*; + const TEST_FILE_PATH: &str = "test_data/synthetic/tiny.fa"; + #[test] - fn read_first_line() { - let mut fileReader = FileReader::default(); - match fileReader.read_file("test_data/synthetic/tiny.fa", 3) { - Ok(records) => { - assert_eq!(3, records.lines.len()) - } - Err(error) => { - panic!("{:?}", error) - } - } + fn read_lines() { + let mut BatchFileReader = BatchFileReader::default(); + let records = BatchFileReader.read_lines(TEST_FILE_PATH, 3).unwrap(); + assert_eq!(3, records.lines.len()); } -} \ No newline at end of file + + fn print_function(input: Records) -> () { + assert_eq!(true, input.size() <= 3); + } + + #[test] + fn read_file_batch() { + let mut BatchFileReader = BatchFileReader::default(); + BatchFileReader + .read_file_by_batch(TEST_FILE_PATH, 3, &print_function) + .unwrap(); + } +} diff --git a/src/file_utils/mod.rs b/src/file_utils/mod.rs index 6f2644c..79b7217 100644 --- a/src/file_utils/mod.rs +++ b/src/file_utils/mod.rs @@ -1 +1 @@ -pub mod file_utility; \ No newline at end of file +pub mod file_utility; diff --git a/src/main.rs b/src/main.rs index 01e7267..1568b0b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,8 +15,8 @@ use crate::processors::split_by_size::split_file_by_size; use crate::processors::tpf_fasta::curate_fasta; use crate::processors::yaml_validator::validate_yaml; -mod file_utils; mod cli; +mod file_utils; mod generics; //use crate::generics::validate_fasta; From 0904ef1e4ab8ed99b29fea958fe81d5d196425a0 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sun, 26 May 2024 11:05:29 +0100 Subject: [PATCH 15/30] Adding documentation --- src/file_utils/file_utility.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index cacd85d..8a0d1d3 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -56,6 +56,8 @@ impl BatchFileReader { /** * Reads a file batch by batch, and applies a function Fn for each chunk + * Function pointers documentation: https://doc.rust-lang.org/book/ch19-05-advanced-functions-and-closures.html#function-pointers + * f is a closure pushed into the stack of read_file_by_batch that is similar to an anonymous function in Java/JavaScript/C# */ pub fn read_file_by_batch( &mut self, @@ -66,6 +68,8 @@ impl BatchFileReader { let file = File::open(file_path)?; let reader = BufReader::new(file); + // map_while() Creates an iterator that both yields elements based on a predicate and maps. + // https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map_while for chunk in &reader.lines().map_while(Result::ok).chunks(batch_size) { f(Records { lines: chunk.collect(), @@ -90,6 +94,8 @@ mod tests { assert_eq!(3, records.lines.len()); } + // You can create the closure in one place and then call the closure elsewhere to evaluate it in a different context. + // Reference: https://doc.rust-lang.org/book/ch13-01-closures.html fn print_function(input: Records) -> () { assert_eq!(true, input.size() <= 3); } From 28d05ca4cd789cce47989101e33429ccbad7cf33 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sun, 26 May 2024 11:12:25 +0100 Subject: [PATCH 16/30] Added new recordline type --- src/file_utils/file_utility.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index 8a0d1d3..bd2b0d8 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -4,6 +4,8 @@ use std::io::{BufRead, BufReader}; use clap::Error; use itertools::Itertools; +struct RecordLine(String); + #[allow(dead_code)] struct Records { lines: Vec, From 37d3bdfcefa9cec2bb74aee61b7adad36e99cdd9 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sun, 26 May 2024 11:24:28 +0100 Subject: [PATCH 17/30] Clippy fixes --- src/file_utils/file_utility.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index bd2b0d8..f8c4ddd 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -65,7 +65,7 @@ impl BatchFileReader { &mut self, file_path: &str, batch_size: usize, - f: &dyn Fn(Records) -> (), + f: &dyn Fn(Records), ) -> Result<(), Error> { let file = File::open(file_path)?; let reader = BufReader::new(file); @@ -98,8 +98,8 @@ mod tests { // You can create the closure in one place and then call the closure elsewhere to evaluate it in a different context. // Reference: https://doc.rust-lang.org/book/ch13-01-closures.html - fn print_function(input: Records) -> () { - assert_eq!(true, input.size() <= 3); + fn print_function(input: Records) { + assert!(input.size() <= 3); } #[test] From 0583492591d66b262e702b2b9754af6cc87f3d74 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sun, 26 May 2024 11:28:19 +0100 Subject: [PATCH 18/30] Allowing dead code (until they are used in upstream ilb calls) --- src/file_utils/file_utility.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index f8c4ddd..e12e74c 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -4,6 +4,7 @@ use std::io::{BufRead, BufReader}; use clap::Error; use itertools::Itertools; +#[allow(dead_code)] struct RecordLine(String); #[allow(dead_code)] @@ -11,6 +12,7 @@ struct Records { lines: Vec, } +#[allow(dead_code)] impl Records { pub fn size(&self) -> usize { self.lines.len() @@ -31,6 +33,7 @@ impl DefaultReader for BatchFileReader { } } +#[allow(dead_code)] impl BatchFileReader { /* * Reads a specific number of lines from a file from the top From b6bcb9c8688a3985cd8d2308629c733110faf1a5 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sun, 26 May 2024 11:30:06 +0100 Subject: [PATCH 19/30] Cargo fmt run --- src/file_utils/file_utility.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index e12e74c..44fe0c9 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -4,7 +4,7 @@ use std::io::{BufRead, BufReader}; use clap::Error; use itertools::Itertools; -#[allow(dead_code)] +#[allow(dead_code)] struct RecordLine(String); #[allow(dead_code)] @@ -12,7 +12,7 @@ struct Records { lines: Vec, } -#[allow(dead_code)] +#[allow(dead_code)] impl Records { pub fn size(&self) -> usize { self.lines.len() @@ -33,7 +33,7 @@ impl DefaultReader for BatchFileReader { } } -#[allow(dead_code)] +#[allow(dead_code)] impl BatchFileReader { /* * Reads a specific number of lines from a file from the top From 034abee3c1584e5459498ad39f1f10681b6c7a30 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sun, 26 May 2024 20:22:57 +0100 Subject: [PATCH 20/30] Adding some error handling logic and logging --- Cargo.lock | 7 ++++++ Cargo.toml | 1 + src/errors/file_error.rs | 26 ++++++++++++++++++++++ src/errors/mod.rs | 1 + src/file_utils/file_utility.rs | 40 ++++++++++++++++++++++++++-------- src/main.rs | 1 + 6 files changed, 67 insertions(+), 9 deletions(-) create mode 100644 src/errors/file_error.rs create mode 100644 src/errors/mod.rs diff --git a/Cargo.lock b/Cargo.lock index b1679d9..6868692 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -438,6 +438,7 @@ dependencies = [ "csv", "io", "itertools", + "log", "noodles", "regex", "serde", @@ -823,6 +824,12 @@ version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3852614a3bd9ca9804678ba6be5e3b8ce76dfc902cae004e3e0c44051b6e88db" +[[package]] +name = "log" +version = "0.4.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" + [[package]] name = "lzma-sys" version = "0.1.20" diff --git a/Cargo.toml b/Cargo.toml index 2cda670..2f4477f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,3 +17,4 @@ regex = "1.9.5" serde = { version = "1.0.188", features = ["derive"] } serde_yaml = "0.9.25" stacker = "0.1.15" +log = "0.4.21" diff --git a/src/errors/file_error.rs b/src/errors/file_error.rs new file mode 100644 index 0000000..5e978da --- /dev/null +++ b/src/errors/file_error.rs @@ -0,0 +1,26 @@ +use std::fmt::{self}; + +use std::io::Error; + +// Define our error types. These may be customized for our error handling cases. +// Now we will be able to write our own errors, defer to an underlying error +// implementation, or do something in between. +// Resource: https://doc.rust-lang.org/rust-by-example/error/multiple_error_types/define_error_type.html +#[derive(Debug, Clone)] +pub struct FileError { + message: String, +} + +impl fmt::Display for FileError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Error in handling the file.") + } +} + +impl From for FileError { + fn from(error: Error) -> Self { + FileError { + message: format!("{}", error), + } + } +} diff --git a/src/errors/mod.rs b/src/errors/mod.rs new file mode 100644 index 0000000..6bf812b --- /dev/null +++ b/src/errors/mod.rs @@ -0,0 +1 @@ +pub mod file_error; diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index 44fe0c9..4c544dd 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -1,7 +1,8 @@ +use log::{info}; use std::fs::File; use std::io::{BufRead, BufReader}; -use clap::Error; +use crate::errors::file_error::FileError; use itertools::Itertools; #[allow(dead_code)] @@ -42,9 +43,19 @@ impl BatchFileReader { &mut self, file_path: &str, num_lines: usize, - ) -> Result, Error> { - let file = File::open(file_path)?; - let reader = BufReader::new(file); + ) -> Result, FileError> { + info!("Reading lines in file."); + let file = File::open(file_path); + + let result = match file { + Ok(file) => file, + Err(error) => { + info!("Error in file handler: {:?}", error); + return Err(error.into()); + } + }; + + let reader = BufReader::new(result); let mut internal_buffer = Vec::::new(); // Error unwrapping: https://tinyurl.com/brt9fphk @@ -69,9 +80,20 @@ impl BatchFileReader { file_path: &str, batch_size: usize, f: &dyn Fn(Records), - ) -> Result<(), Error> { - let file = File::open(file_path)?; - let reader = BufReader::new(file); + ) -> Result<(), FileError> { + info!("Reading file by chunk."); + + let file = File::open(file_path); + + let result = match file { + Ok(file) => file, + Err(error) => { + info!("Error in file handler: {:?}", error); + return Err(error.into()); + } + }; + + let reader = BufReader::new(result); // map_while() Creates an iterator that both yields elements based on a predicate and maps. // https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map_while @@ -101,7 +123,7 @@ mod tests { // You can create the closure in one place and then call the closure elsewhere to evaluate it in a different context. // Reference: https://doc.rust-lang.org/book/ch13-01-closures.html - fn print_function(input: Records) { + fn assert_function(input: Records) { assert!(input.size() <= 3); } @@ -109,7 +131,7 @@ mod tests { fn read_file_batch() { let mut BatchFileReader = BatchFileReader::default(); BatchFileReader - .read_file_by_batch(TEST_FILE_PATH, 3, &print_function) + .read_file_by_batch(TEST_FILE_PATH, 3, &assert_function) .unwrap(); } } diff --git a/src/main.rs b/src/main.rs index 1568b0b..d66d003 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,6 +16,7 @@ use crate::processors::tpf_fasta::curate_fasta; use crate::processors::yaml_validator::validate_yaml; mod cli; +mod errors; mod file_utils; mod generics; //use crate::generics::validate_fasta; From c4ed2de31c16f9f93607581be5e0cba815408330 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sun, 26 May 2024 20:24:45 +0100 Subject: [PATCH 21/30] Fmt fixes --- src/file_utils/file_utility.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index 4c544dd..b87000d 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -1,4 +1,4 @@ -use log::{info}; +use log::info; use std::fs::File; use std::io::{BufRead, BufReader}; From e436eff43388335ab8a81c7d55a30ffa6c0d8dab Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Sun, 26 May 2024 20:25:52 +0100 Subject: [PATCH 22/30] Allowing dead code until it's used. --- src/errors/file_error.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/errors/file_error.rs b/src/errors/file_error.rs index 5e978da..77ec0f8 100644 --- a/src/errors/file_error.rs +++ b/src/errors/file_error.rs @@ -7,6 +7,7 @@ use std::io::Error; // implementation, or do something in between. // Resource: https://doc.rust-lang.org/rust-by-example/error/multiple_error_types/define_error_type.html #[derive(Debug, Clone)] +#[allow(dead_code)] pub struct FileError { message: String, } From 7de11f2a031f995d771744bd9c9bb435f1c2e7a8 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Mon, 27 May 2024 08:30:25 +0100 Subject: [PATCH 23/30] Refactoring items --- src/file_utils/file_utility.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index b87000d..8a58c01 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -10,13 +10,13 @@ struct RecordLine(String); #[allow(dead_code)] struct Records { - lines: Vec, + items: Vec, } #[allow(dead_code)] impl Records { pub fn size(&self) -> usize { - self.lines.len() + self.items.len() } } @@ -66,7 +66,7 @@ impl BatchFileReader { } Ok(Records { - lines: internal_buffer, + items: internal_buffer, }) } @@ -99,7 +99,7 @@ impl BatchFileReader { // https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map_while for chunk in &reader.lines().map_while(Result::ok).chunks(batch_size) { f(Records { - lines: chunk.collect(), + items: chunk.collect(), }); } @@ -118,7 +118,7 @@ mod tests { fn read_lines() { let mut BatchFileReader = BatchFileReader::default(); let records = BatchFileReader.read_lines(TEST_FILE_PATH, 3).unwrap(); - assert_eq!(3, records.lines.len()); + assert_eq!(3, records.items.len()); } // You can create the closure in one place and then call the closure elsewhere to evaluate it in a different context. From ec7c7363423420a1a39f031a5cc9aa2b5aadbccc Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 31 May 2024 14:57:49 +0100 Subject: [PATCH 24/30] Removing dead code. --- src/file_utils/file_utility.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index 8a58c01..1e0b69e 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -5,9 +5,6 @@ use std::io::{BufRead, BufReader}; use crate::errors::file_error::FileError; use itertools::Itertools; -#[allow(dead_code)] -struct RecordLine(String); - #[allow(dead_code)] struct Records { items: Vec, From 6e078ec1787edfdede3ceeec653aa805a0584563 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 31 May 2024 15:32:15 +0100 Subject: [PATCH 25/30] Added lib crate for modularity --- src/file_utils/file_utility.rs | 4 +- src/lib.rs | 83 +++++++++++++++++++++++++++++++++ src/main.rs | 85 +++------------------------------- 3 files changed, 93 insertions(+), 79 deletions(-) create mode 100644 src/lib.rs diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index 1e0b69e..6ccb07f 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -71,6 +71,8 @@ impl BatchFileReader { * Reads a file batch by batch, and applies a function Fn for each chunk * Function pointers documentation: https://doc.rust-lang.org/book/ch19-05-advanced-functions-and-closures.html#function-pointers * f is a closure pushed into the stack of read_file_by_batch that is similar to an anonymous function in Java/JavaScript/C# + * https://doc.rust-lang.org/book/ch13-01-closures.html#moving-captured-values-out-of-closures-and-the-fn-traits + * Note that f is not intended to mutate the captured Records value, and should not return anything (i.e., move the captured Record value out of the closure). */ pub fn read_file_by_batch( &mut self, @@ -129,6 +131,6 @@ mod tests { let mut BatchFileReader = BatchFileReader::default(); BatchFileReader .read_file_by_batch(TEST_FILE_PATH, 3, &assert_function) - .unwrap(); + .unwrap_or_else(|e| panic!("Error: {:?}", e)); } } diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..6fe7e03 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,83 @@ +use clap::Parser; + +use cli::{Cli, Commands}; +use std::io::Error; + +// Reference: https://doc.rust-lang.org/book/ch07-02-defining-modules-to-control-scope-and-privacy.html +use crate::processors::exclude_seq::filter_fasta; +use crate::processors::map_headers::map_fasta_head; +use crate::processors::remap_head::remapping_head; +use crate::processors::split_by_count::split_file_by_count; +use crate::processors::split_by_size::split_file_by_size; +use crate::processors::tpf_fasta::curate_fasta; +use crate::processors::yaml_validator::validate_yaml; + +mod cli; +mod errors; +mod file_utils; +mod generics; +//use crate::generics::validate_fasta; + +mod processors; + +pub fn run() -> Result<(), Error> { + let cli = Cli::parse(); + + match &cli.command { + Some(Commands::YamlValidator { + yaml, + verbose, + output, + }) => validate_yaml(yaml, verbose, output), + Some(Commands::SplitByCount { + fasta_file, + output_directory, + data_type, + sanitise, + count, + }) => split_file_by_count(fasta_file, output_directory, data_type, sanitise, count), + Some(Commands::SplitBySize { + fasta_file, + mem_size, + output_directory, + }) => split_file_by_size(fasta_file, mem_size, output_directory), + Some(Commands::MapHeaders { + fasta_file, + output_directory, + replace_with, + }) => _ = map_fasta_head(fasta_file, output_directory, replace_with), + Some(Commands::ReMapHeaders { + fasta_file, + output_directory, + map_file, + }) => remapping_head(fasta_file, output_directory, map_file), + Some(Commands::Curate { + fasta, + tpf, + sort, + output, + n_length, + }) => curate_fasta(fasta, tpf, sort, output, n_length), + Some(Commands::FilterFasta { + fasta, + output, + filter_list, + }) => filter_fasta(fasta, output, filter_list), + Some(Commands::GenesetCSVS { .. }) => { + todo!() + } + Some(Commands::Profile { .. }) => { + todo!() + } + Some(Commands::Subset { .. }) => { + todo!() + } + Some(Commands::Mergehaps { .. }) => { + todo!() + } + None => { + panic!("No command given!") + } + } + Ok(()) +} diff --git a/src/main.rs b/src/main.rs index d66d003..8bf8d2f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,85 +2,14 @@ use std::io::Error; -use clap::Parser; - -use cli::{Cli, Commands}; - -// Reference: https://doc.rust-lang.org/book/ch07-02-defining-modules-to-control-scope-and-privacy.html -use crate::processors::exclude_seq::filter_fasta; -use crate::processors::map_headers::map_fasta_head; -use crate::processors::remap_head::remapping_head; -use crate::processors::split_by_count::split_file_by_count; -use crate::processors::split_by_size::split_file_by_size; -use crate::processors::tpf_fasta::curate_fasta; -use crate::processors::yaml_validator::validate_yaml; - -mod cli; -mod errors; -mod file_utils; -mod generics; -//use crate::generics::validate_fasta; - -mod processors; +use fasta_manipulation::run; +// https://doc.rust-lang.org/book/ch12-03-improving-error-handling-and-modularity.html#separation-of-concerns-for-binary-projects fn main() -> Result<(), Error> { - let cli = Cli::parse(); - - match &cli.command { - Some(Commands::YamlValidator { - yaml, - verbose, - output, - }) => validate_yaml(yaml, verbose, output), - Some(Commands::SplitByCount { - fasta_file, - output_directory, - data_type, - sanitise, - count, - }) => split_file_by_count(fasta_file, output_directory, data_type, sanitise, count), - Some(Commands::SplitBySize { - fasta_file, - mem_size, - output_directory, - }) => split_file_by_size(fasta_file, mem_size, output_directory), - Some(Commands::MapHeaders { - fasta_file, - output_directory, - replace_with, - }) => _ = map_fasta_head(fasta_file, output_directory, replace_with), - Some(Commands::ReMapHeaders { - fasta_file, - output_directory, - map_file, - }) => remapping_head(fasta_file, output_directory, map_file), - Some(Commands::Curate { - fasta, - tpf, - sort, - output, - n_length, - }) => curate_fasta(fasta, tpf, sort, output, n_length), - Some(Commands::FilterFasta { - fasta, - output, - filter_list, - }) => filter_fasta(fasta, output, filter_list), - Some(Commands::GenesetCSVS { .. }) => { - todo!() - } - Some(Commands::Profile { .. }) => { - todo!() - } - Some(Commands::Subset { .. }) => { - todo!() - } - Some(Commands::Mergehaps { .. }) => { - todo!() - } - None => { - panic!("No command given!") - } + if let Err(e) = run() { + eprintln!("Error: {}", e); + std::process::exit(1); + } else { + Ok(()) } - Ok(()) } From 9d263da75527df5e27c5edd04cfda4f028905c8e Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 31 May 2024 15:35:18 +0100 Subject: [PATCH 26/30] Updated clippy fixes --- src/file_utils/file_utility.rs | 8 ++++---- src/processors/yaml_validator.rs | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs index 6ccb07f..8a30eaa 100644 --- a/src/file_utils/file_utility.rs +++ b/src/file_utils/file_utility.rs @@ -115,8 +115,8 @@ mod tests { #[test] fn read_lines() { - let mut BatchFileReader = BatchFileReader::default(); - let records = BatchFileReader.read_lines(TEST_FILE_PATH, 3).unwrap(); + let mut batch_file_reader = BatchFileReader::default(); + let records = batch_file_reader.read_lines(TEST_FILE_PATH, 3).unwrap(); assert_eq!(3, records.items.len()); } @@ -128,8 +128,8 @@ mod tests { #[test] fn read_file_batch() { - let mut BatchFileReader = BatchFileReader::default(); - BatchFileReader + let mut batch_file_reader = BatchFileReader::default(); + batch_file_reader .read_file_by_batch(TEST_FILE_PATH, 3, &assert_function) .unwrap_or_else(|e| panic!("Error: {:?}", e)); } diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs index f7a5172..a0a595c 100644 --- a/src/processors/yaml_validator.rs +++ b/src/processors/yaml_validator.rs @@ -29,9 +29,9 @@ struct Assembly { level: String, sample_id: String, latin_name: String, - classT: String, - asmVersion: u16, - gevalType: String, + class_t: String, + asm_version: u16, + geval_type: String, } #[derive(Debug, Serialize, Deserialize)] @@ -233,7 +233,7 @@ pub fn validate_yaml(file: &String, _verbose: &bool, _output: &str) { let genesets = contents.alignment.geneset.split(','); for set in genesets { let gene_alignment_path = contents.alignment.data_dir.clone() - + &contents.assembly.classT + + &contents.assembly.class_t + "/csv_data/" + set + "-data.csv"; @@ -242,7 +242,7 @@ pub fn validate_yaml(file: &String, _verbose: &bool, _output: &str) { println!("{}", "CHECKING SYNTENY DIRECTORY RESOLVES".blue()); let synteny_full = - contents.synteny.synteny_genome_path.clone() + &contents.assembly.classT + "/"; + contents.synteny.synteny_genome_path.clone() + &contents.assembly.class_t + "/"; validate_paths(&synteny_full, "SYNTENY-FASTA"); validate_data(&synteny_full, "synteny"); From b0d7592bf23f00f6f96519263518a1a6ea4820ed Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 31 May 2024 16:03:13 +0100 Subject: [PATCH 27/30] Refactoring main function --- src/main.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index 8bf8d2f..3282090 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,15 +1,13 @@ #![allow(non_snake_case)] -use std::io::Error; - use fasta_manipulation::run; // https://doc.rust-lang.org/book/ch12-03-improving-error-handling-and-modularity.html#separation-of-concerns-for-binary-projects -fn main() -> Result<(), Error> { +fn main() { if let Err(e) = run() { eprintln!("Error: {}", e); std::process::exit(1); } else { - Ok(()) + println!("Done!"); } } From f0b5fbd8dddaeb5e637c284178359f12e2973317 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Fri, 31 May 2024 16:14:49 +0100 Subject: [PATCH 28/30] Refactoring lib --- src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6fe7e03..36ac37b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,7 +16,6 @@ mod cli; mod errors; mod file_utils; mod generics; -//use crate::generics::validate_fasta; mod processors; @@ -76,7 +75,7 @@ pub fn run() -> Result<(), Error> { todo!() } None => { - panic!("No command given!") + println!("No command provided"); } } Ok(()) From 9b5caadb668e039b33bd7dd0f0778b3522bfbc93 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Tue, 4 Jun 2024 18:19:16 +0100 Subject: [PATCH 29/30] Setup human-panic --- Cargo.lock | 289 +++++++++++++++++++++++++++++++++++++++++++++++----- Cargo.toml | 1 + src/main.rs | 3 + 3 files changed, 266 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6868692..5baf7b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" +dependencies = [ + "gimli", +] + [[package]] name = "adler" version = "1.0.2" @@ -44,8 +53,23 @@ dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", - "anstyle-wincon", + "anstyle-wincon 2.1.0", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstream" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon 3.0.3", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] @@ -70,7 +94,7 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" dependencies = [ - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -80,7 +104,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "58f54d10c6dfa51283a066ceab3ec1ab78d13fae00aa49243a45e4571fb79dfd" dependencies = [ "anstyle", - "windows-sys", + "windows-sys 0.48.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", ] [[package]] @@ -98,6 +132,21 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +[[package]] +name = "backtrace" +version = "0.3.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17c6a35df3749d2e8bb1b7b21a976d82b15548788d2735b9d82f329268f71a11" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + [[package]] name = "bit-vec" version = "0.6.3" @@ -179,12 +228,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.83" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" -dependencies = [ - "libc", -] +checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f" [[package]] name = "cfg-if" @@ -208,7 +254,7 @@ version = "4.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5179bb514e4d7c2051749d8fcefa2ed6d06a9f4e6d69faf3805f5d80b8cf8d56" dependencies = [ - "anstream", + "anstream 0.5.0", "anstyle", "clap_lex", "strsim", @@ -246,7 +292,7 @@ checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6" dependencies = [ "is-terminal", "lazy_static", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -415,7 +461,7 @@ checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd" dependencies = [ "errno-dragonfly", "libc", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -436,6 +482,7 @@ dependencies = [ "colored", "compare", "csv", + "human-panic", "io", "itertools", "log", @@ -587,6 +634,23 @@ dependencies = [ "version_check", ] +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "gimli" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" + [[package]] name = "gmeta" version = "1.3.0" @@ -665,6 +729,22 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "human-panic" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4c5d0e9120f6bca6120d142c7ede1ba376dd6bf276d69dd3dbe6cbeb7824179" +dependencies = [ + "anstream 0.6.14", + "anstyle", + "backtrace", + "os_info", + "serde", + "serde_derive", + "toml", + "uuid", +] + [[package]] name = "impl-codec" version = "0.6.0" @@ -715,9 +795,15 @@ checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi", "rustix", - "windows-sys", + "windows-sys 0.48.0", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + [[package]] name = "itertools" version = "0.13.0" @@ -976,12 +1062,32 @@ dependencies = [ "noodles-csi", ] +[[package]] +name = "object" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8ec7ab813848ba4522158d5517a6093db1ded27575b070f4177b8d12b41db5e" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "os_info" +version = "3.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae99c7fa6dd38c7cafe1ec085e804f8f555a2f8659b0dbe03f1f9963a9b51092" +dependencies = [ + "log", + "serde", + "windows-sys 0.52.0", +] + [[package]] name = "page_size" version = "0.6.0" @@ -1010,7 +1116,7 @@ version = "3.6.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be30eaf4b0a9fba5336683b38de57bb86d179a35862ba6bfcf57625d006bde5b" dependencies = [ - "proc-macro-crate 2.0.2", + "proc-macro-crate 2.0.0", "proc-macro2", "quote", "syn 1.0.109", @@ -1058,11 +1164,10 @@ dependencies = [ [[package]] name = "proc-macro-crate" -version = "2.0.2" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b00f26d3400549137f92511a46ac1cd8ce37cb5598a96d382381458b992a5d24" +checksum = "7e8366a6159044a37876a2b9817124296703c586a5c92e2c53751fa06d8d43e8" dependencies = [ - "toml_datetime", "toml_edit 0.20.2", ] @@ -1122,6 +1227,12 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + [[package]] name = "rustc_version" version = "0.4.0" @@ -1141,7 +1252,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -1200,6 +1311,15 @@ dependencies = [ "syn 2.0.60", ] +[[package]] +name = "serde_spanned" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0" +dependencies = [ + "serde", +] + [[package]] name = "serde_yaml" version = "0.9.25" @@ -1281,11 +1401,26 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "toml" +version = "0.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit 0.22.14", +] + [[package]] name = "toml_datetime" -version = "0.6.3" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b" +checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" +dependencies = [ + "serde", +] [[package]] name = "toml_edit" @@ -1309,6 +1444,18 @@ dependencies = [ "winnow", ] +[[package]] +name = "toml_edit" +version = "0.22.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", +] + [[package]] name = "typenum" version = "1.17.0" @@ -1351,12 +1498,27 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +[[package]] +name = "uuid" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a183cf7feeba97b4dd1c0d46788634f6221d87fa961b305bed08c851829efcc0" +dependencies = [ + "getrandom", +] + [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "winapi" version = "0.3.9" @@ -1385,7 +1547,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.5", ] [[package]] @@ -1394,13 +1565,29 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +dependencies = [ + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] [[package]] @@ -1409,42 +1596,90 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" + [[package]] name = "winnow" version = "0.5.40" diff --git a/Cargo.toml b/Cargo.toml index 2f4477f..a523174 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,3 +18,4 @@ serde = { version = "1.0.188", features = ["derive"] } serde_yaml = "0.9.25" stacker = "0.1.15" log = "0.4.21" +human-panic = "2.0.0" diff --git a/src/main.rs b/src/main.rs index 3282090..5fafbc2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,12 @@ #![allow(non_snake_case)] +use human_panic::setup_panic; use fasta_manipulation::run; // https://doc.rust-lang.org/book/ch12-03-improving-error-handling-and-modularity.html#separation-of-concerns-for-binary-projects fn main() { + // https://rust-cli.github.io/book/in-depth/human-communication.html + setup_panic!(); if let Err(e) = run() { eprintln!("Error: {}", e); std::process::exit(1); From 6b1d389430a31cd9ec115d73478f8c13a3e905fc Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Tue, 4 Jun 2024 18:21:43 +0100 Subject: [PATCH 30/30] Fix linting issues --- src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 5fafbc2..6163c35 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,7 @@ #![allow(non_snake_case)] -use human_panic::setup_panic; use fasta_manipulation::run; +use human_panic::setup_panic; // https://doc.rust-lang.org/book/ch12-03-improving-error-handling-and-modularity.html#separation-of-concerns-for-binary-projects fn main() {