diff --git a/assnake/api/fs_helpers.py b/assnake/api/fs_helpers.py index 8e9683b..c26815e 100644 --- a/assnake/api/fs_helpers.py +++ b/assnake/api/fs_helpers.py @@ -49,6 +49,32 @@ def get_samples_from_dir(directory_with_reads, modify_name = None): for s in samples ] + if len(samples_list) == 0: + ending_variant = {'name': 'single_end', 'strands': {'R1': '', 'R2':'_R2'}} + variant_w_ext = ending_variant['strands']['R1'] + ext + globbing_path = os.path.join(directory_with_reads, '*' + variant_w_ext) + files_mathching_pattern = glob.glob(globbing_path) + + # get sample names + samples = [ + filename.split('/')[-1].replace(variant_w_ext, '') # Take last part of path (basename) and replace _R1/fastq.gz with nothing to get sample name + for filename in files_mathching_pattern + ] + + # prepare sample dicts + samples_list += [ + { + 'name_in_run': s, + 'modified_name': modify_name(s) if modify_name is not None else s, + + 'ending_variant_id': ending_variant['name'], + 'ending_variant_R1': ending_variant['strands']['R1'], + 'ending_variant_R2': ending_variant['strands']['R2'], + 'directory': directory_with_reads, + 'extension': ext + } + for s in samples + ] return pd.DataFrame(samples_list) @@ -105,17 +131,20 @@ def create_links(import_dir, samples, hard = False, create_dir_if_not_exist = Fa if rename: os.rename(src_r1, dst_r1) - os.rename(src_r2, dst_r2) + if os.path.exists(src_r2): + os.rename(src_r2, dst_r2) return if hard: copy2(src_r1, dst_r1) - copy2(src_r2, dst_r2) + if os.path.exists(src_r2): + copy2(src_r2, dst_r2) return try: os.symlink(src_r1, dst_r1) - os.symlink(src_r2, dst_r2) + if os.path.exists(src_r2): + os.symlink(src_r2, dst_r2) except FileExistsError as e: print(e) \ No newline at end of file diff --git a/assnake/api/loaders.py b/assnake/api/loaders.py index 7c69eea..fe78584 100644 --- a/assnake/api/loaders.py +++ b/assnake/api/loaders.py @@ -87,6 +87,16 @@ def load_sample(fs_prefix, df, preproc, df_sample, if report_size: size = os.path.getsize(r1) + os.path.getsize(r2) sample_dict.update({'size': bytes2human(size, symbols='iec'), 'bytes': size}) + elif os.path.isfile(r1): + containers.append(p) + if len(p) > len(final_preproc): + final_preproc = p + if report_size: + size = os.path.getsize(r1) + sample_dict.update({'size': bytes2human(size, symbols='iec'), 'bytes': size}) + else: + click.secho('There are no reads in a path: %s'%r1, fg='red') + exit() return {'df':df, 'df_sample':df_sample, 'df_sample':df_sample, @@ -108,7 +118,6 @@ def load_sample_set(wc_config, fs_prefix, df, preproc, samples_to_add = [], do_n do_not_add: list of sample names NOT to add pattern: sample names must match this glob pattern to be included. ''' - if wc_config is None: wc_config = load_wc_config() diff --git a/assnake/cli/assnake_cli.py b/assnake/cli/assnake_cli.py index 807b4de..ae60d6d 100644 --- a/assnake/cli/assnake_cli.py +++ b/assnake/cli/assnake_cli.py @@ -83,7 +83,7 @@ def dataset(): dataset.add_command(dataset_commands.df_import_reads) dataset.add_command(dataset_commands.df_delete) dataset.add_command(dataset_commands.rescan_dataset) - +dataset.add_command(dataset_commands.df_update_info_wrapper) #--------------------------------------------------------------------------------------- # assnake RESULT *** group diff --git a/assnake/cli/commands/dataset_commands.py b/assnake/cli/commands/dataset_commands.py index 049e63e..4bbbbbf 100644 --- a/assnake/cli/commands/dataset_commands.py +++ b/assnake/cli/commands/dataset_commands.py @@ -8,239 +8,248 @@ from zipfile import ZipFile from assnake.api.loaders import update_fs_samples_csv from pathlib import Path +from assnake.api.loaders import InputError +from assnake.core.config import read_assnake_instance_config + # some util def show_av_dict(dfs): - ''' - print out available datasets from dict of db's - ''' - avail_dfs = '' - for i, item in enumerate(dfs.keys()): - avail_dfs += '{}. {} \t'.format(i + 1, item) + ''' + print out available datasets from dict of db's + ''' + avail_dfs = '' + for i, item in enumerate(dfs.keys()): + avail_dfs += '{}. {} \t'.format(i + 1, item) - if avail_dfs == '': - avail_dfs = 'NA' - click.echo('Available: ' + avail_dfs) - exit(2) + if avail_dfs == '': + avail_dfs = 'NA' + click.echo('Available: ' + avail_dfs) + exit(2) # --------------------------------------------------------------------------------------- -# LIST +# LIST # --------------------------------------------------------------------------------------- @click.command(name='list') def df_list(): - """List datasets in database""" - dfs = assnake.Dataset.list_in_db() - if len(list(dfs.keys())) == 0: - click.echo('No datasets in your system yet!\nYou can create one by running\n' + - click.style(' assnake dataset create ', bg='blue', fg='white', bold=True)) - - for df in dfs.values(): - df_name = df['df'] - click.echo(click.style('' * 2 + df_name + ' ' * 2, fg='green', bold=True)) - # click.echo(' Filesystem prefix: ' + df.get('fs_prefix', '')) - click.echo(' Full path: ' + os.path.join(df.get('fs_prefix', ''), df['df'])) - # click.echo(' Description: ') - # dict_norm_print(df.get('description', '')) - click.echo('') + """List datasets in database""" + dfs = assnake.Dataset.list_in_db() + if len(list(dfs.keys())) == 0: + click.secho('No datasets in your system yet!\nYou can create one by running\n' + + click.style(' assnake dataset create ', bg='blue', fg='white', bold=True)) + + for df in dfs.values(): + df_name = df['df'] + click.secho(click.style('' * 2 + df_name + ' ' * 2, fg='green', bold=True)) + # click.echo(' Filesystem prefix: ' + df.get('fs_prefix', '')) + click.echo(' Full path: ' + os.path.join(df.get('fs_prefix', ''), df['df'])) + # click.echo(' Description: ') + # dict_norm_print(df.get('description', '')) + click.echo('') # --------------------------------------------------------------------------------------- -# CREATE +# CREATE # --------------------------------------------------------------------------------------- def check_df_name_avialability(df): - - dfs_in_db = assnake.Dataset.list_in_db() - if df in dfs_in_db: - click.secho('Dataset with such name already exists!. Aborting.') - exit() - return df + + dfs_in_db = assnake.Dataset.list_in_db() + if df in dfs_in_db: + click.secho('Dataset with such name already exists!. Aborting.') + exit() + return df def validate_df_prefix(df, fs_prefix, create_prefix = False, allow_not_empty_full_path = False): - """ - Ensures that can write to full_df_path and it is empty - """ - df = check_df_name_avialability(df) - if df is None: - exit() - - if not os.path.isabs(fs_prefix): - click.secho('fs_prefix is not absolute!') - exit() - full_df_path = os.path.join(fs_prefix, df) - - if os.path.isdir(full_df_path): - if not os.listdir(full_df_path) and os.access(full_df_path, os.W_OK): - click.echo('Great! ' + full_df_path + ' exists, is empty and assnake can write to that folder, initializing dataset here') - return full_df_path - elif os.listdir(full_df_path): - if allow_not_empty_full_path: - click.echo(full_df_path + ' exists, but it is not empty. allow_not_empty_full_path is true, existance logic will be handled downstream.') - return full_df_path - else: - click.echo('Oops =( ' + full_df_path + ' exists, but it is not empty. Aborting.') - return None - elif not os.access(full_df_path, os.W_OK): - click.echo('Oops =( ' + full_df_path + ' exists, but assnake doesnt have permissions to write there. Aborting.') - return None - else: - click.echo('Oops =( ' + full_df_path + ' exists, but it is not directory. Aborting.') - - - if os.path.isdir(fs_prefix): - if os.access(fs_prefix, os.W_OK): - click.echo('Great! ' + fs_prefix + ' exists, and assnake can write to that folder, initializing dataset in:') - click.echo(full_df_path + ' (Will be created)') - return full_df_path - else: - if not create_prefix: - click.echo('Oops =( ' + fs_prefix + ' doesnt exist, and create_prefix is set to False. Aborting.') - else: - click.echo('Great! ' + fs_prefix + ' will be created, and dataset will be initialized in:') - click.echo(full_df_path + ' (Will be created)') - return full_df_path - - return None + """ + Ensures that can write to full_df_path and it is empty + """ + df = check_df_name_avialability(df) + if df is None: + exit() + + if not os.path.isabs(fs_prefix): + click.secho('fs_prefix is not absolute!') + exit() + full_df_path = os.path.join(fs_prefix, df) + + if os.path.isdir(full_df_path): + if not os.listdir(full_df_path) and os.access(full_df_path, os.W_OK): + click.echo('Great! ' + full_df_path + ' exists, is empty and assnake can write to that folder, initializing dataset here') + return full_df_path + elif os.listdir(full_df_path): + if allow_not_empty_full_path: + click.echo(full_df_path + ' exists, but it is not empty. allow_not_empty_full_path is true, existance logic will be handled downstream.') + return full_df_path + else: + click.echo('Oops =( ' + full_df_path + ' exists, but it is not empty. Aborting.') + return None + elif not os.access(full_df_path, os.W_OK): + click.echo('Oops =( ' + full_df_path + ' exists, but assnake doesnt have permissions to write there. Aborting.') + return None + else: + click.echo('Oops =( ' + full_df_path + ' exists, but it is not directory. Aborting.') + + + if os.path.isdir(fs_prefix): + if os.access(fs_prefix, os.W_OK): + click.echo('Great! ' + fs_prefix + ' exists, and assnake can write to that folder, initializing dataset in:') + click.echo(full_df_path + ' (Will be created)') + return full_df_path + else: + if not create_prefix: + click.echo('Oops =( ' + fs_prefix + ' doesnt exist, and create_prefix is set to False. Aborting.') + else: + click.echo('Great! ' + fs_prefix + ' will be created, and dataset will be initialized in:') + click.echo(full_df_path + ' (Will be created)') + return full_df_path + + return None def check_absolute_path(path, path_name): - path = path.rstrip('\/') - if not os.path.isabs(path): - click.secho(path_name + ' is not absolute!') - exit() - return path + path = path.rstrip('\/') + if not os.path.isabs(path): + click.secho(path_name + ' is not absolute!') + exit() + return path @click.command(name='create') @click.option('--df-name', '-d', help='Name of the dataset', required = False) @click.option('--data-storage-folder', '-f', - help='Folder where you want to store your data. \ - Directory with provided df-name will be created inside this folder. MUST exist, may be not empty.\ - Resulting {storage-folder}/{df-name} directory MUST be empty, may not exist. \ - For regestiring existing datasets inside new Assnake instance use assnake dataset register', required=False) + help='Folder where you want to store your data. \ + Directory with provided df-name will be created inside this folder. MUST exist, may be not empty.\ + Resulting {storage-folder}/{df-name} directory MUST be empty, may not exist. \ + For regestiring existing datasets inside new Assnake instance use assnake dataset register', required=False) @click.option('--full-path-to-df', '-p', - help='Full path to dataset folder. \ - Last part of the path (basename) will be used as df-name. MUST be empty, may not exist. \ - For regestiring existing datasets inside new Assnake instance use assnake dataset register', required=False) + help='Full path to dataset folder. \ + Last part of the path (basename) will be used as df-name. MUST be empty, may not exist. \ + For regestiring existing datasets inside new Assnake instance use assnake dataset register', required=False) +@click.option('--data-type', '-t', + help='Type of NGS data. Illumina Metagenomic WGS and 16s are supported.', + required = False, + type=click.Choice(['METAGENOMIC_16s', 'METAGENOMIC_WGS', 'VIROME', 'RNA_SEQ'], case_sensitive=False)) @click.option('--first-preprocessing-name', - help='Name of your first preprocessing. raw by default. You want to set it to sra, for exaple if yoo are planning to download from NCBI. Purely cosmetic effect.', required=False, default = 'raw') + help='Name of your first preprocessing. raw by default. You want to set it to sra, for exaple if yoo are planning to download from NCBI. Purely cosmetic effect.', required=False, default = 'raw') @click.option('--description', '-D', nargs=2, multiple=True, required=False, type=click.Tuple([str, str]), - help='Add some description in this way ` assnake dataset create ... -D property_1 value_1 ... -D property_n value_n`') + help='Add some description in this way ` assnake dataset create ... -D property_1 value_1 ... -D property_n value_n`') @click.option('--quietly', '-q', is_flag=True, help='Doing it quietly. No questions.') @click.option('--test-data', '-t', is_flag=True, help='Download test data from Humann2 tutorial') @click.pass_obj -def df_create(config, df_name, data_storage_folder, full_path_to_df,first_preprocessing_name, description, quietly, test_data): - """Register your dataset inside ASSNAKE!\n - You can use it in interactive mode. - Usage: assnake dataset create [dataset] or -d [dataset] .. - - """ - - if ((df_name is None) or (data_storage_folder is None)): - if full_path_to_df is None: - click.echo('You must specify df_name AND data_storage_folder, OR full_path_to_df') - exit() - - else: # full_path_to_df road - # click.echo('df-name or data-storage-folder not set, using full_path_to_df') - click.echo('Using full_path_to_df') - full_path_to_df = check_absolute_path(full_path_to_df, 'full_path_to_df') - df = os.path.basename(full_path_to_df) - fs_prefix = os.path.dirname(full_path_to_df) - full_df_path = validate_df_prefix(df, fs_prefix, True) - - # df_name AND data_storage_folder road - elif (df_name is not None) and (data_storage_folder is not None): - if full_path_to_df is not None: - click.echo('df-name AND data-storage-folder are set, ignoring full_path_to_df') - - click.secho('Using df_name AND data_storage_folder') - data_storage_folder = check_absolute_path(data_storage_folder, 'data_storage_folder') - df = df_name # TODO VALIDATE - fs_prefix = data_storage_folder - full_df_path = validate_df_prefix(df, fs_prefix, True) - - os.makedirs(full_df_path, exist_ok=True) - os.makedirs(os.path.join(full_df_path, 'reads', first_preprocessing_name), exist_ok=True) - df_info = {'df': df, 'fs_prefix': fs_prefix, 'description': {}} - - assnake_db = config['config']['assnake_db'] - os.makedirs(os.path.join(assnake_db, 'datasets'), exist_ok=True) - df_path_in_assnake = os.path.join(assnake_db, 'datasets', df) - os.symlink(full_df_path, df_path_in_assnake, target_is_directory = True) - - with open(os.path.join(df_path_in_assnake, 'df_info.yaml'), 'w') as info_file: - yaml.dump(df_info, info_file, default_flow_style=False) - click.secho('Saved dataset ' + df + ' sucessfully!', fg='green') - - - if test_data: - download_from_url('http://kronos.pharmacology.dal.ca/public_files/tutorial_datasets/mgs_tutorial_Oct2017.zip', - os.path.join(fs_prefix, df,'mgs_tutorial_Oct2017.zip')) - - with ZipFile(os.path.join(fs_prefix, df,'mgs_tutorial_Oct2017.zip'), 'r') as zipObj: - - zipObj.extractall(os.path.join(fs_prefix, df,'./')) - shutil.rmtree(os.path.join(fs_prefix, df, 'reads'), ignore_errors=True) - os.makedirs(os.path.join(fs_prefix, df, 'reads'), exist_ok=True) - shutil.move (os.path.join(fs_prefix, df,'mgs_tutorial_Oct2017/raw_data'), os.path.join(fs_prefix, df, 'reads/raw')) - # TODO cleanup - shutil.rmtree(os.path.join(fs_prefix, df,'mgs_tutorial_Oct2017'), ignore_errors=True) +def df_create(config, data_type, df_name, data_storage_folder, full_path_to_df,first_preprocessing_name, description, quietly, test_data): + """Register your dataset inside ASSNAKE!\n + You can use it in interactive mode. + Usage: assnake dataset create [dataset] or -d [dataset] .. + + """ + + if ((df_name is None) or (data_storage_folder is None)): + if full_path_to_df is None: + click.echo('You must specify df_name AND data_storage_folder, OR full_path_to_df') + exit() + + else: # full_path_to_df road + # click.echo('df-name or data-storage-folder not set, using full_path_to_df') + click.echo('Using full_path_to_df') + full_path_to_df = check_absolute_path(full_path_to_df, 'full_path_to_df') + df = os.path.basename(full_path_to_df) + fs_prefix = os.path.dirname(full_path_to_df) + full_df_path = validate_df_prefix(df, fs_prefix, True) + + # df_name AND data_storage_folder road + elif (df_name is not None) and (data_storage_folder is not None): + if full_path_to_df is not None: + click.echo('df-name AND data-storage-folder are set, ignoring full_path_to_df') + + click.secho('Using df_name AND data_storage_folder') + data_storage_folder = check_absolute_path(data_storage_folder, 'data_storage_folder') + df = df_name # TODO VALIDATE + fs_prefix = data_storage_folder + full_df_path = validate_df_prefix(df, fs_prefix, True) + + os.makedirs(full_df_path, exist_ok=True) + os.makedirs(os.path.join(full_df_path, 'reads', first_preprocessing_name), exist_ok=True) + df_info = {'df': df, 'fs_prefix': fs_prefix, 'description': {}, 'data_type': data_type, + 'full_path':full_df_path, 'dataset_type':None, 'processes':{}} + + assnake_db = config['config']['assnake_db'] + os.makedirs(os.path.join(assnake_db, 'datasets'), exist_ok=True) + df_path_in_assnake = os.path.join(assnake_db, 'datasets', df) + os.symlink(full_df_path, df_path_in_assnake, target_is_directory = True) + + with open(os.path.join(df_path_in_assnake, 'df_info.yaml'), 'w') as info_file: + yaml.dump(df_info, info_file, default_flow_style=False) + click.secho('Saved dataset ' + df + ' sucessfully!', fg='green') + + + if test_data: + download_from_url('http://kronos.pharmacology.dal.ca/public_files/tutorial_datasets/mgs_tutorial_Oct2017.zip', + os.path.join(fs_prefix, df,'mgs_tutorial_Oct2017.zip')) + + with ZipFile(os.path.join(fs_prefix, df,'mgs_tutorial_Oct2017.zip'), 'r') as zipObj: + + zipObj.extractall(os.path.join(fs_prefix, df,'./')) + shutil.rmtree(os.path.join(fs_prefix, df, 'reads'), ignore_errors=True) + os.makedirs(os.path.join(fs_prefix, df, 'reads'), exist_ok=True) + shutil.move (os.path.join(fs_prefix, df,'mgs_tutorial_Oct2017/raw_data'), os.path.join(fs_prefix, df, 'reads/raw')) + # TODO cleanup + shutil.rmtree(os.path.join(fs_prefix, df,'mgs_tutorial_Oct2017'), ignore_errors=True) # --------------------------------------------------------------------------------------- -# INIT +# INIT # --------------------------------------------------------------------------------------- @click.command(name='init', help = 'Register dataset in Assnake based on the folder from where you called the command. (Working directory)') @click.option('--data-type', '-t', - help='Type of NGS data. Illumina Metagenomic WGS and 16s are supported.', - required = False, - type=click.Choice(['METAGENOMIC_16s', 'METAGENOMIC_WGS', 'VIROME', 'RNA_SEQ'], case_sensitive=False)) + help='Type of NGS data. Illumina Metagenomic WGS and 16s are supported.', + required = False, + type=click.Choice(['METAGENOMIC_16s', 'METAGENOMIC_WGS', 'VIROME', 'RNA_SEQ'], case_sensitive=False)) @click.option('--df-name', '-d', help='Name of the dataset. If provided, folder with this name will be created in current dir.', required = False) @click.option('--first-preprocessing-name', - help='Name of your first preprocessing. raw by default. You want to set it to sra, for exaple if yoo are planning to download from NCBI. Purely cosmetic effect.', required=False, default = 'raw') + help='Name of your first preprocessing. raw by default. You want to set it to sra, for exaple if yoo are planning to download from NCBI. Purely cosmetic effect.', required=False, default = 'raw') @click.pass_obj def df_init(config, data_type, df_name, first_preprocessing_name): - cwd = os.getcwd() - - if df_name is None: - df = os.path.basename(cwd) - fs_prefix = os.path.dirname(cwd) - full_df_path = validate_df_prefix(df, fs_prefix, True, True) - else: - df = df_name - fs_prefix = cwd - full_df_path = validate_df_prefix(df, fs_prefix, True, True) - - full_path_empty = True - if os.path.isdir(full_df_path) and not os.listdir(full_df_path): - click.echo(full_df_path + ' not empty!') - click.echo('Trying to import as existing Assnake dataset (Not properly implemented)') - full_path_empty = False + cwd = os.getcwd() + + if df_name is None: + df = os.path.basename(cwd) + fs_prefix = os.path.dirname(cwd) + full_df_path = validate_df_prefix(df, fs_prefix, True, True) + else: + df = df_name + fs_prefix = cwd + full_df_path = validate_df_prefix(df, fs_prefix, True, True) + + full_path_empty = True + if os.path.isdir(full_df_path) and not os.listdir(full_df_path): + click.echo(full_df_path + ' not empty!') + click.echo('Trying to import as existing Assnake dataset (Not properly implemented)') + full_path_empty = False - # os.makedirs(os.path.join(full_df_path, 'reads', first_preprocessing_name), exist_ok=True) - df_info = {'df': df, 'fs_prefix': fs_prefix, 'description': {}, 'data_type': data_type} + # os.makedirs(os.path.join(full_df_path, 'reads', first_preprocessing_name), exist_ok=True) + df_info = {'df': df, 'fs_prefix': fs_prefix, 'description': {}, 'data_type': data_type, + 'full_path':full_df_path, 'dataset_type':None, 'processes':{}} - assnake_db = config['config']['assnake_db'] - os.makedirs(os.path.join(assnake_db, 'datasets'), exist_ok=True) - df_path_in_assnake = os.path.join(assnake_db, 'datasets', df) - os.symlink(full_df_path, df_path_in_assnake, target_is_directory = True) + assnake_db = config['config']['assnake_db'] + os.makedirs(os.path.join(assnake_db, 'datasets'), exist_ok=True) + df_path_in_assnake = os.path.join(assnake_db, 'datasets', df) + os.symlink(full_df_path, df_path_in_assnake, target_is_directory = True) - df_info_loc = os.path.join(df_path_in_assnake, 'df_info.yaml') - if not os.path.isfile(df_info_loc): - with open(df_info_loc, 'w') as info_file: - yaml.dump(df_info, info_file, default_flow_style=False) - click.secho('Saved dataset ' + df + ' sucessfully!', fg='green') + df_info_loc = os.path.join(df_path_in_assnake, 'df_info.yaml') + if not os.path.isfile(df_info_loc): + with open(df_info_loc, 'w') as info_file: + yaml.dump(df_info, info_file, default_flow_style=False) + click.secho('Saved dataset ' + df + ' sucessfully!', fg='green') # --------------------------------------------------------------------------------------- -# INFO +# INFO # --------------------------------------------------------------------------------------- @click.command(name='info') @click.option('--df', '-d', help='Name of the dataset', required=False) @@ -248,147 +257,164 @@ def df_init(config, data_type, df_name, first_preprocessing_name): @click.argument('df_arg', required=False) @click.pass_obj def df_info(config, df, preproc, df_arg): - """View info for the specific dataset - Usage: assnake dataset info [dataset] or -d [dataset] ... - - """ - # df argument/option logic - if not (bool(df is None) ^ bool(df_arg is None)): - click.echo('Please, specify dataset either as option or argument') - df = click.prompt('Type the name in') - if df is None: - df = df_arg - - # Trying to load dataset and display information - try: - df = assnake.Dataset(df) - click.echo(click.style('='*2 + ' '*3 + df.df + ' '*3 + '=' * 2, fg='green', bold=True)) - click.echo(str(df)) - if preproc is not None: - samples = df.sample_sets[preproc] - samples = samples.set_index('df_sample') - click.echo(tabulate(samples.sort_values('reads'), headers='keys', tablefmt='fancy_grid')) - return - except assnake.api.loaders.InputError as e: - print(e.message) - return + """View info for the specific dataset + Usage: assnake dataset info [dataset] or -d [dataset] ... + + """ + # df argument/option logic + if not (bool(df is None) ^ bool(df_arg is None)): + click.echo('Please, specify dataset either as option or argument') + df = click.prompt('Type the name in') + if df is None: + df = df_arg + + # Trying to load dataset and display information + try: + df_info_loc = config['config']['assnake_db']+'/datasets/{df}/df_info.yaml'.format(df = df) + df = assnake.Dataset(df) + click.secho(click.style('='*2 + ' '*3 + df.df + ' '*3 + '=' * 2, fg='green', bold=True)) + click.echo(str(df)) + if preproc is not None: + samples = df.sample_sets[preproc] + samples = samples.set_index('df_sample') + click.echo(tabulate(samples.sort_values('reads'), headers='keys', tablefmt='fancy_grid')) + df_info = {} + if os.path.isfile(df_info_loc): + with open(df_info_loc, 'r') as stream: + try: + info = yaml.load(stream, Loader=yaml.FullLoader) + if 'df' in info: + df_info = info + except yaml.YAMLError as exc: + pass + if df_info is not {}: + if 'processes' in df_info: + click.echo('Processes:') + for process_name, process_files in df_info['processes'].items(): + click.echo('\t%s'%process_name) + click.echo('\t\t- '+'\n\t\t- '.join(process_files)) + return + except InputError as e: + print(e.message) + return # --------------------------------------------------------------------------------------- -# DELETE +# DELETE # --------------------------------------------------------------------------------------- @click.command(name='delete') @click.option('--df', '-d', - help='Name of dataset to delete.', type=click.STRING ) + help='Name of dataset to delete.', type=click.STRING ) @click.option('--hard', help='If is set, hard removing will be used instead of modifying config file', is_flag=True) @click.argument('df_arg', required=False) @click.pass_obj def df_delete(config, df, hard, df_arg): - """ - Delete datasets - Usage: assnake dataset delete [dataset] or -d [dataset] ... - """ - click.echo('NOT IMPLEMENTED') - # if not (bool(df is None) ^ bool(df_arg is None)): - # click.echo('Please, specify dataset either as option or argument') - # df = click.prompt('Type the name in:') - # if df is None: - # df = df_arg - # dfs = assnake.Dataset.list_in_db() - # try: - # df_info = dfs[df] - # except KeyError as e: - # click.echo('Can`t reach database with such name') - # show_av_dict(dfs) - - # respond = fs_helpers.delete_ds(df) - # if respond[0]: - # click.secho('Successfully deleted', fg='bright_white', bg='green') - # else: - # click.secho('ERROR', bg='red') - # click.echo('For details see traceback below') - # click.echo(respond[1]) - - # if hard and click.confirm( - # 'Are you sure to delete this nice and probably huge datasets, which you may redownload for eternity -- use modifying config instead?', - # abort=True): - # shutil.rmtree(os.path.join(df_info.get('fs_prefix', ''), df)) + """ + Delete datasets + Usage: assnake dataset delete [dataset] or -d [dataset] ... + """ + click.echo('NOT IMPLEMENTED') + # if not (bool(df is None) ^ bool(df_arg is None)): + # click.echo('Please, specify dataset either as option or argument') + # df = click.prompt('Type the name in:') + # if df is None: + # df = df_arg + # dfs = assnake.Dataset.list_in_db() + # try: + # df_info = dfs[df] + # except KeyError as e: + # click.echo('Can`t reach database with such name') + # show_av_dict(dfs) + + # respond = fs_helpers.delete_ds(df) + # if respond[0]: + # click.secho('Successfully deleted', fg='bright_white', bg='green') + # else: + # click.secho('ERROR', bg='red') + # click.echo('For details see traceback below') + # click.echo(respond[1]) + + # if hard and click.confirm( + # 'Are you sure to delete this nice and probably huge datasets, which you may redownload for eternity -- use modifying config instead?', + # abort=True): + # shutil.rmtree(os.path.join(df_info.get('fs_prefix', ''), df)) # --------------------------------------------------------------------------------------- -# IMPORT-READS +# IMPORT-READS # --------------------------------------------------------------------------------------- # DONE decide if we need either d and t or proceed both arguments as one and automatically choose path or not @click.command(name='import-reads') @click.option('--reads-dir', '-r', prompt='Location of folder with read files', - help='Location of folder with read files', type=click.Path()) + help='Location of folder with read files', type=click.Path()) @click.option('--dataset', '-d', help='Assnake dataset name. If -t is not specified', required=False) @click.option('--rename-method', help='How to rename samples', type=click.Choice(['replace-', 'removeSending'], case_sensitive=False), required=False) @click.option('--target', '-t', help='Location of the target directory. If -d is not specified.', required=False, - type=click.Path()) + type=click.Path()) @click.option('--sample_set', '-s', help='Comma-divided list of samples of interest', required=False) @click.option('--sample_list', '-l', help='Location of file with line by line samples of interest', required=False, - type=click.Path()) + type=click.Path()) @click.option('--copy', help='If is set, hard copying will be used instead of symbolic links ', is_flag=True) @click.pass_obj def df_import_reads(config, reads_dir, dataset, rename_method, target, sample_set, sample_list, copy): - """ - Import reads from directory to assnake dataset. Currently local text files are supported. The --target argument - point to location (relative or absolute) of assnake dataset in your file system. Please, pay attention, - that -t and -d arguments are excclusive for each over -- specify only one of them -- as well as -s and -l. - With -s `sample_1,sample_2,...,sample_n` notation is implied (no whitespaces between sample names) - """ - - reads_dir = str(Path(reads_dir).resolve()) - # stuff about presence of arguments - arg_d = not bool(dataset is None) - arg_t = not bool(target is None) - arg_s = not bool(sample_set is None) - arg_l = not bool(sample_list is None) - - # check if samples arguments are ok - if arg_l & arg_s: - click.secho('Collision tends to be observed. Please, specify either list of samples in prompt or in file', - err=True) - exit(1) - - # check if destination args are ok - if not (arg_d ^ arg_t): - click.secho('Please, specify either database (-d) or absolute path (-t)', err=True) - exit(1) - - # some stuffff to ensure correctness of source and destination (how philosophical) - if arg_d: - try: - df_info = assnake.Dataset(dataset) - except assnake.loaders.InputError as e: - dfs = assnake.Dataset.list_in_db() - click.echo('Can`t reach database with such name', err=True) - show_av_dict(dfs) - target = '{}/reads/raw'.format(df_info.full_path) - else: - # Whaaat - target = pathizer(target) - if not os.path.exists(target): - click.secho("Provided sample-list file couldn't be detected", err=True) - exit(2) - - target = '{}/reads/raw'.format(df_info.full_path) - os.makedirs(target, exist_ok=True) - - if rename_method == 'removeSending': - modify_name=lambda arg: '_'.join(arg.replace('-', '_').split('_')[0:-1]) - else: - modify_name=lambda arg: arg.replace('-', '_') - - samples_in_run = fs_helpers.get_samples_from_dir(reads_dir, modify_name) - if len(samples_in_run) > 0: - samples_in_run['df_sample'] = samples_in_run['modified_name'] - fs_helpers.create_links(target, samples_in_run, hard=copy) - - update_fs_samples_csv(df_info.df) - click.secho("SUCCESSFULLY IMPORTED READS!", fg='green') - else: - click.secho('No reads in directory ' + reads_dir, fg='yellow') + """ + Import reads from directory to assnake dataset. Currently local text files are supported. The --target argument + point to location (relative or absolute) of assnake dataset in your file system. Please, pay attention, + that -t and -d arguments are excclusive for each over -- specify only one of them -- as well as -s and -l. + With -s `sample_1,sample_2,...,sample_n` notation is implied (no whitespaces between sample names) + """ + + reads_dir = str(Path(reads_dir).resolve()) + # stuff about presence of arguments + arg_d = not bool(dataset is None) + arg_t = not bool(target is None) + arg_s = not bool(sample_set is None) + arg_l = not bool(sample_list is None) + + # check if samples arguments are ok + if arg_l & arg_s: + click.secho('Collision tends to be observed. Please, specify either list of samples in prompt or in file', + err=True) + exit(1) + + # check if destination args are ok + if not (arg_d ^ arg_t): + click.secho('Please, specify either database (-d) or absolute path (-t)', err=True) + exit(1) + + # some stuffff to ensure correctness of source and destination (how philosophical) + if arg_d: + try: + df_info = assnake.Dataset(dataset) + except assnake.loaders.InputError as e: + dfs = assnake.Dataset.list_in_db() + click.echo('Can`t reach database with such name', err=True) + show_av_dict(dfs) + target = '{}/reads/raw'.format(df_info.full_path) + else: + # Whaaat + target = pathizer(target) + if not os.path.exists(target): + click.secho("Provided sample-list file couldn't be detected", err=True) + exit(2) + + target = '{}/reads/raw'.format(df_info.full_path) + os.makedirs(target, exist_ok=True) + + if rename_method == 'removeSending': + modify_name=lambda arg: '_'.join(arg.replace('-', '_').split('_')[0:-1]) + else: + modify_name=lambda arg: arg.replace('-', '_') + + samples_in_run = fs_helpers.get_samples_from_dir(reads_dir, modify_name) + if len(samples_in_run) > 0: + samples_in_run['df_sample'] = samples_in_run['modified_name'] + fs_helpers.create_links(target, samples_in_run, hard=copy) + df_info = assnake.Dataset(dataset) + df_update_info(df_name=dataset, dataset_type=df_info.dataset_type) + update_fs_samples_csv(df_info.df) + click.secho("SUCCESSFULLY IMPORTED READS!", fg='green') + else: + click.secho('No reads in directory ' + reads_dir, fg='yellow') @click.command(name='rescan') @@ -396,16 +422,146 @@ def df_import_reads(config, reads_dir, dataset, rename_method, target, sample_se @click.argument('df_arg', required=False) @click.pass_obj def rescan_dataset(config, dataset, df_arg): - """ - Now it just updates fs_samples.tsv in ./assnkae_db/{dataset}/ - - Usage: assnake dataset rescan [dataset] or -d [dataset] .. - """ - if not (bool(dataset is None) ^ bool(df_arg is None)): - click.echo('Please, specify dataset either as option or argument') - dataset = click.prompt('Type the name in:') - if dataset is None: - dataset = df_arg - success = update_fs_samples_csv(dataset) - if success: - click.secho('SUCCESSFULLY UPDATED INFORMATION IN DATABASE!', fg='green') + """ + Now it just updates fs_samples.tsv in ./assnkae_db/{dataset}/ + + Usage: assnake dataset rescan [dataset] or -d [dataset] .. + """ + if not (bool(dataset is None) ^ bool(df_arg is None)): + click.echo('Please, specify dataset either as option or argument') + dataset = click.prompt('Type the name in:') + if dataset is None: + dataset = df_arg + success = update_fs_samples_csv(dataset) + if success: + click.secho('SUCCESSFULLY UPDATED INFORMATION IN DATABASE!', fg='green') + + + +# --------------------------------------------------------------------------------------- +# UNPDATE-INFO +# --------------------------------------------------------------------------------------- +def validate_df_prefix_existence(fs_prefix, allow_not_empty_full_path=True): + """ + Ensures that can write to full_df_path and it is not empty + """ + if not os.path.isabs(fs_prefix): + click.secho('fs_prefix is not absolute!', fg='red') + exit() + + if os.path.isdir(fs_prefix): + if os.access(fs_prefix, os.W_OK): + click.echo('Great! ' + fs_prefix + ' exists, and assnake can write to that folder') + return fs_prefix + else: + click.secho('Oops =( ' + fs_prefix + ' exists, but assnake doesnt have permissions to write there. Aborting.', fg='red') + else: + click.secho('Oops =( ' + fs_prefix + ' does not exist. Aborting.', fg='red') + return None + +def validate_full_path_existence(full_df_path, allow_not_empty_full_path=True): + """ + Ensures that can write to full_df_path and it is not empty + """ + if not os.path.isabs(full_df_path): + click.secho('Full path is not absolute!', fg='red') + exit() + + if os.path.isdir(full_df_path): + if not os.listdir(full_df_path) and os.access(full_df_path, os.W_OK): + click.echo('Warning! ' + full_df_path + ' exists, but is empty.') + return full_df_path + elif os.listdir(full_df_path): + click.echo('Great! '+full_df_path + ' exists and it is not empty.') + return full_df_path + elif not os.access(full_df_path, os.W_OK): + click.secho('Oops =( ' + full_df_path + ' exists, but assnake doesnt have permissions to write there. Aborting.', fg='red') + return None + else: + click.secho('Oops =( ' + full_df_path + ' exists, but it is not directory. Aborting.', fg='red') + return None + + +@click.command(name='update-info', help = 'Update df_info.yml file.') +@click.option('--df-name', '-d', help='Name of the dataset.', required = True) +@click.option('--data-type', '-t', + help='Type of NGS data. Illumina Metagenomic WGS and 16s are supported.', + required = False, + type=click.Choice(['METAGENOMIC_16s', 'METAGENOMIC_WGS', 'VIROME', 'RNA_SEQ'], case_sensitive=False)) +@click.option('--dataset-type', '-r', help='Type of raw reads: either single-end or paired-end.', + type=click.Choice(['single-end', 'paired-end'], case_sensitive=False), required = False) +@click.option('--description', '-ds', help='Dataset description.', required = False) +@click.option('--prefix', '-pp', help='Prefix path to the dataset. If given, the full path will be changed automatically. If both given, nothing will happen automatically.', required = False) +@click.option('--full-path', '-fp', help='Full path to the dataset. If given, the prefix will be changed automatically. If both given, nothing will happen automatically.', required = False) +@click.option('--processes', '-p', help='Final files of all processes which were run by user with the assnake.', required = False) +@click.option('--clean_paths', '-c', help='Delete all processes paths that do not exist anymore.', required = False, is_flag=True) + +@click.pass_obj +def df_update_info_wrapper(config, df_name, data_type, dataset_type, description, prefix, full_path, processes, clean_paths): + df_update_info(df_name, data_type, dataset_type, description, prefix, full_path, processes, clean_paths) + +def df_update_info(df_name=None, data_type=None, dataset_type=None, description=None, prefix=None, full_path=None, processes=None, clean_paths=False): + if df_name is None: + click.secho('Please, specify the dataset name.', fg='red') + exit() + instance_config = read_assnake_instance_config() + + df_info_loc = instance_config['assnake_db']+'/datasets/{df}/df_info.yaml'.format(df = df_name) + df_info = {} + + if not os.path.isfile(df_info_loc): + raise InputError('NO DATASET ' + df_name) + + with open(df_info_loc, 'r') as stream: + try: + info = yaml.load(stream, Loader=yaml.FullLoader) + if 'df' in info: + df_info = info + except yaml.YAMLError as exc: + print(exc) + + was_changed = [] + for name, new_info in zip(['df', 'fs_prefix', 'description', 'data_type', + 'full_path', 'dataset_type', 'processes'], + [df_name, prefix, description, data_type, + full_path, dataset_type, processes]): + if new_info is not None: + df_info[name] = new_info + was_changed.append(name) + + prefix_df_name_validation = True + full_path_validation = True + if ('fs_prefix' in was_changed): + prefix_df_name_validation = validate_df_prefix_existence(df_info['fs_prefix']) + if ('full_path' not in was_changed) and (prefix_df_name_validation is not None): + df_info['full_path'] = '%s/%s'%(df_info['fs_prefix'], df_info['df']) + full_path_validation = validate_full_path_existence(df_info['full_path']) + + if ('full_path' in was_changed): + full_path_validation = validate_full_path_existence(df_info['full_path']) + if ('fs_prefix' not in was_changed) and (full_path_validation is not None): + df_info['prefix'] = os.dirname(df_info['full_path']) + prefix_df_name_validation = validate_df_prefix_existence(df_info['fs_prefix']) + + if clean_paths: + if 'processes' in df_info: + names_to_delete = [] + for process_name, process_files in df_info['processes'].items(): + process_files = [process_file for process_file in process_files + if os.path.exists('%s/%s'%(df_info['full_path'], process_file))] + if len(process_files) == 0: + names_to_delete.append(process_name) + else: + df_info['processes'][process_name] = process_files + for process_name in names_to_delete: + del df_info['processes'][process_name] + + if (full_path_validation is not None) and (prefix_df_name_validation is not None): + with open(df_info_loc, 'w') as info_file: + yaml.dump(df_info, info_file, default_flow_style=False) + click.secho('Dataset ' + df_name + ' was updated sucessfully!', fg='green') + else: + click.secho('Sorry, you are trying to setup wrong paths.', fg='red') + + + diff --git a/assnake/cli/commands/execute_commands.py b/assnake/cli/commands/execute_commands.py index fee87e1..bd3daaa 100644 --- a/assnake/cli/commands/execute_commands.py +++ b/assnake/cli/commands/execute_commands.py @@ -2,10 +2,10 @@ # Perform assnake *** RUN cli command # ############################################# -import click, os +import click, os, yaml from assnake.core.config import read_internal_config from assnake.api.loaders import update_fs_samples_csv - +from assnake.cli.commands.dataset_commands import df_update_info #--------------------------------------------------------------------------------------- # RUN @@ -47,13 +47,56 @@ def gather(config, threads, jobs, drmaa, run, touch): conda_prefix = config['config']['conda_dir'], drmaa=drmaa_param, touch = touch, - cores=jobs, nodes=jobs) - - print(config['requested_results']) + cores=jobs, nodes=jobs, + # ignore_ambiguity=True + ) + # print('STATUS') + # print(status) + # print() + # print(config['requests']) + # print(config['requests_storage']) + # print(config['requested_results']) if run: click.echo('Updating Datasets:' + str(config['requested_dfs'])) for requested_df in set(config['requested_dfs']): update_fs_samples_csv(requested_df) - print(config['requested_results']) \ No newline at end of file + if 'requests_storage' in config: + for df in config['requests_storage'].keys(): + df_info_loc = config['config']['assnake_db']+'/datasets/{df}/df_info.yaml'.format(df = df) + df_info = {} + if os.path.isfile(df_info_loc): + with open(df_info_loc, 'r') as stream: + try: + info = yaml.load(stream, Loader=yaml.FullLoader) + if 'df' in info: + df_info = info + except yaml.YAMLError as exc: + pass + if df_info is not {}: + processes = df_info['processes'] if 'processes' in df_info else {} + for process_name in config['requests_storage'][df]: + process_name = process_name.replace('-', '_') + if process_name not in processes: + processes[process_name] = [] + file_set = [filepath.replace(df_info['full_path']+'/', '') for filepath in config['requests_storage'][df][process_name] + if os.path.exists(filepath)] + processes[process_name] += file_set + df_update_info(df_name = df, processes=processes, clean_paths=True) + else: + click.secho("df_info.yaml could not be update, since this feature was not implemented for this function.", fg='yellow') + else: + if 'destroy_if_not_run' in config: + if 'files' in config['destroy_if_not_run']: + for file in config['destroy_if_not_run']['files']: + os.remove(file) + if 'directories' in config['destroy_if_not_run']: + for folder in config['destroy_if_not_run']['directories']: + if len(os.listdir(folder)) == 0: + os.rmdir(folder) + + + # print(config['requests']) + # print(config['requests_storage']) + # print(config['requested_results']) \ No newline at end of file diff --git a/assnake/core/dataset.py b/assnake/core/dataset.py index b15fd68..f77979f 100644 --- a/assnake/core/dataset.py +++ b/assnake/core/dataset.py @@ -1,6 +1,6 @@ import os, glob, yaml, time import pandas as pd -from assnake.api.loaders import load_sample, load_sample_set +from assnake.api.loaders import load_sample, load_sample_set, InputError from assnake.core.config import load_wc_config, read_assnake_instance_config from assnake.viz import plot_reads_count_change @@ -23,26 +23,29 @@ def __init__(self, df, include_preprocs=True): wc_config = load_wc_config() instance_config = read_assnake_instance_config() - df_info_loc = instance_config['assnake_db']+'/datasets/{df}/df_info.yaml'.format(df = df) - df_info = {} + self.df_info_loc = instance_config['assnake_db']+'/datasets/{df}/df_info.yaml'.format(df = df) + self.df_info = {} - if not os.path.isfile(df_info_loc): - raise assnake.api.loaders.InputError('NO DATASET ' + df) + if not os.path.isfile(self.df_info_loc): + raise InputError('NO DATASET ' + df) - with open(df_info_loc, 'r') as stream: + with open(self.df_info_loc, 'r') as stream: try: info = yaml.load(stream, Loader=yaml.FullLoader) if 'df' in info: - df_info = info + self.df_info = info except yaml.YAMLError as exc: print(exc) - reads_dir = os.path.join(df_info['fs_prefix'], df_info['df'], 'reads/*') + reads_dir = os.path.join(self.df_info['fs_prefix'], self.df_info['df'], 'reads/*') + dataset_type_checker_pattern = os.path.join(self.df_info['fs_prefix'], self.df_info['df'], 'reads/raw/*_R2.*') # check in raw preprocess folder if dataset is paired-end + dataset_type_checker = glob.glob(dataset_type_checker_pattern) preprocs = [p.split('/')[-1] for p in glob.glob(reads_dir)] preprocessing = {} - self.df = df_info['df'] - self.fs_prefix = df_info['fs_prefix'] + self.df = self.df_info['df'] + self.fs_prefix = self.df_info['fs_prefix'] + self.dataset_type = 'paired-end' if len(dataset_type_checker) > 0 else 'single-end' self.full_path = os.path.join(self.fs_prefix, self.df) if include_preprocs: @@ -102,6 +105,7 @@ def __str__(self): for preproc in preprocs: preprocessing_info = preprocessing_info + 'Samples in ' + preproc + ' - ' + str(len(self.sample_sets[preproc])) + '\n' return 'Dataset name: ' + self.df + '\n' + \ + 'Dataset type: ' + self.dataset_type + '\n' + \ 'Filesystem prefix: ' + self.fs_prefix +'\n' + \ 'Full path: ' + os.path.join(self.fs_prefix, self.df) + '\n' + preprocessing_info @@ -124,6 +128,24 @@ def to_dict(self): 'preprocs': preprocs } + def get_processinfo(self): + all_processes = [] + all_outputfiles = [] + processinfo = {} + for process, output_files in self.df_info['processes'].items(): + processinfo.update({process : pd.DataFrame.from_dict( + {'process':[process]*len(output_files), + 'output_file':output_files}).to_dict(orient='records')}) + all_processes.extend([process]*len(output_files)) + all_outputfiles.extend(output_files) + processinfo['all'] = pd.DataFrame.from_dict({'process':all_processes, + 'output_file':all_outputfiles}).to_dict(orient='records') + return { + 'df': self.df, + # 'fs_prefix': self.fs_prefix, + 'processinfo': processinfo + } + # TODO rework this stuff. This should register custom methods from modules in Dataset, like loading metaphlan # for entry_point in iter_entry_points('assnake.plugins'): diff --git a/assnake/core/preset_manager.py b/assnake/core/preset_manager.py index 14f8abb..bfff28f 100644 --- a/assnake/core/preset_manager.py +++ b/assnake/core/preset_manager.py @@ -86,11 +86,11 @@ def find_preset_by_name(self, preset_name): presets_glob = os.path.join(instance_config['assnake_db'], self.dir_in_database, '*.' + self.preset_file_format) presets = [p.split('/')[-1].replace('.'+self.preset_file_format, '') for p in glob.glob(presets_glob)] + print(presets_glob) preset_dicts = [ { 'preset_name' :p.split('.')[0], - 'preset_hash': p.split('.')[1], + 'preset_hash': p.split('.')[1] if '.' in p else '', 'full_name': p } for p in presets ] - return next((p for p in preset_dicts if p['preset_name'] == preset_name), None) - + return (next((p for p in preset_dicts if p['preset_name'] == preset_name), None), os.path.dirname(presets_glob)) diff --git a/assnake/core/result.py b/assnake/core/result.py index e0b3a6b..d9f0218 100644 --- a/assnake/core/result.py +++ b/assnake/core/result.py @@ -59,8 +59,15 @@ def generate_cli_command(self): @click.pass_obj def result_invocation(config, strand, **kwargs): sample_set, sample_set_name = generic_command_individual_samples(config, **kwargs) - config['requests'] += generate_result_list(sample_set, self.result_wc, strand=strand) - + generated_result_list = generate_result_list(sample_set, self.result_wc, strand=strand) + config['requests'] += generated_result_list + if 'requests_storage' not in config: + config['requests_storage'] = {} + if kwargs['df'] not in config['requests_storage']: + config['requests_storage'][kwargs['df']] = {} + if self.name not in config['requests_storage'][kwargs['df']]: + config['requests_storage'][kwargs['df']][self.name] = [] + config['requests_storage'][kwargs['df']][self.name] += generated_result_list return result_invocation elif self.input_type == 'illumina_sample': @@ -81,8 +88,16 @@ def result_invocation(config, **kwargs): sample_set, sample_set_name = generic_command_individual_samples( config, **kwargs) - config['requests'] += generate_result_list( - sample_set, self.result_wc, **kwargs) + + generated_result_list = generate_result_list(sample_set, self.result_wc, **kwargs) + config['requests'] += generated_result_list + if 'requests_storage' not in config: + config['requests_storage'] = {} + if kwargs['df'] not in config['requests_storage']: + config['requests_storage'][kwargs['df']] = {} + if self.name not in config['requests_storage'][kwargs['df']]: + config['requests_storage'][kwargs['df']][self.name] = [] + config['requests_storage'][kwargs['df']][self.name] += generated_result_list return result_invocation @@ -101,6 +116,13 @@ def result_invocation(config, strand, **kwargs): res_list = prepare_sample_set_tsv_and_get_results( sample_set_dir_wc, result_wc, df=kwargs['df'], sample_sets=sample_sets, strand=strand, overwrite=False) config['requests'] += res_list + if 'requests_storage' not in config: + config['requests_storage'] = {} + if kwargs['df'] not in config['requests_storage']: + config['requests_storage'][kwargs['df']] = {} + if self.name not in config['requests_storage'][kwargs['df']]: + config['requests_storage'][kwargs['df']][self.name] = [] + config['requests_storage'][kwargs['df']][self.name] += res_list return result_invocation @@ -120,6 +142,13 @@ def result_invocation(config, **kwargs): res_list = prepare_sample_set_tsv_and_get_results(sample_set_dir_wc, result_wc, sample_sets = sample_sets, **kwargs) config['requests'] += res_list + if 'requests_storage' not in config: + config['requests_storage'] = {} + if kwargs['df'] not in config['requests_storage']: + config['requests_storage'][kwargs['df']] = {} + if self.name not in config['requests_storage'][kwargs['df']]: + config['requests_storage'][kwargs['df']][self.name] = [] + config['requests_storage'][kwargs['df']][self.name] += res_list return result_invocation diff --git a/assnake/core/sample_set.py b/assnake/core/sample_set.py index 43e1359..d0de606 100644 --- a/assnake/core/sample_set.py +++ b/assnake/core/sample_set.py @@ -1,137 +1,151 @@ -import assnake.api.loaders -import assnake -from tabulate import tabulate -import click, os, datetime -import pandas as pd - - -def generic_command_dict_of_sample_sets(config, df, preproc, meta_column, column_value, samples_to_add, exclude_samples, **kwargs): - ''' - This returns several sample sets. - ''' - df_loaded = assnake.Dataset(df) - - sample_sets_dict = {} - - meta_loc = os.path.join(df_loaded.full_path, 'df_samples.tsv') - if os.path.isfile(meta_loc): - meta = pd.read_csv(meta_loc, sep = '\t') - if meta_column is not None: - if column_value is not None: - sample_set, sample_set_name = generic_command_individual_samples(config, df, preproc, meta_column, column_value, samples_to_add, exclude_samples, **kwargs) - sample_sets_dict.update({sample_set_name: sample_set}) - else: # treat empty column_value as creating multiple sample_sets for each column_value - column_values = list(meta[meta_column].unique()) - for column_value in column_values: - sample_set, sample_set_name = generic_command_individual_samples(config, df, preproc, meta_column, column_value, samples_to_add, exclude_samples, **kwargs) - sample_sets_dict.update({sample_set_name: sample_set}) - else: - sample_set, sample_set_name = generic_command_individual_samples(config, df, preproc, meta_column, column_value, samples_to_add, exclude_samples, **kwargs) - sample_sets_dict.update({sample_set_name: sample_set}) - else: - sample_set, sample_set_name = generic_command_individual_samples(config, df, preproc, meta_column, column_value, samples_to_add, exclude_samples, **kwargs) - sample_sets_dict.update({sample_set_name: sample_set}) - - return sample_sets_dict - -def generic_command_individual_samples(config, df, preproc, meta_column, column_value, samples_to_add, exclude_samples, **kwargs): - """ - Construct sample sets, has multiple options. - Returns dict or sample_sets based on the provided options. - - meta_column - factor column in sample metadata sheet. Cannot be combined with samples_to_add. exclude_samples has higher proirity. - column_value - value of column to select by. \ - Can be multiple - separated by commas without whitespace. \ - If meta_column is provided, bot no column_value is provided \ - - treat it like select all unique values of that column. - If multiple - one value - one sample_set. If --merge enabled - all values go in one sample_set - - assnake result request megahit -d FMT_FHM -c source run - - """ - exclude_samples = [] if exclude_samples == '' else [c.strip() for c in exclude_samples.split(',')] - samples_to_add = [] if samples_to_add == '' else [c.strip() for c in samples_to_add.split(',')] - - df_loaded = assnake.Dataset(df) - config['requested_dfs'] += [df_loaded.df] - - # Now for the meta column stuff - meta_loc = os.path.join(df_loaded.full_path, 'df_samples.tsv') - if os.path.isfile(meta_loc): - meta = pd.read_csv(meta_loc, sep = '\t') - if meta_column is not None: - if column_value is not None: - subset_by_col_value = meta.loc[meta[meta_column] == column_value] - if len(subset_by_col_value) > 0: - samples_to_add = list(subset_by_col_value['df_sample']) - - if preproc is None: - # LONGEST - click.echo('Preprocessing is not specified, using longest for now') - preproc = max(list(df_loaded.sample_sets.keys()), key=len) - - sample_set = assnake.api.loaders.load_sample_set(config['wc_config'], df_loaded.fs_prefix, df_loaded.df, preproc, samples_to_add=samples_to_add) - if len(exclude_samples) > 0 : - sample_set = sample_set.loc[~sample_set['df_sample'].isin(exclude_samples), ] - - # click.echo(tabulate(sample_set[['df_sample', 'reads', 'preproc']].sort_values('reads'), headers='keys', tablefmt='fancy_grid')) - - # construct sample set name for fs - if meta_column is None and column_value is None: - curr_date = datetime.datetime.now() - def_name = '{month}{year}'.format(month=curr_date.strftime("%b"), year=curr_date.strftime("%y")) - sample_set_name = def_name - else: - sample_set_name = meta_column + '__' + column_value - - return sample_set, sample_set_name - - -def generate_result_list(sample_set, wc_str, **kwargs): - res_list = [] - # print(kwargs) - kwargs.pop('df') - kwargs.pop('preproc') - for s in sample_set.to_dict(orient='records'): - preprocessing = s['preproc'] - - res_list.append(wc_str.format( - fs_prefix = s['fs_prefix'].rstrip('\/'), - df = s['df'], - preproc = preprocessing, - df_sample = s['df_sample'], - **kwargs - )) - return res_list - -def prepare_sample_set_tsv_and_get_results(sample_set_dir_wc, result_wc, df, sample_sets, overwrite,**kwargs): - res_list = [] - - df_loaded = assnake.Dataset(df) - - for sample_set_name in sample_sets.keys(): - sample_set_dir = sample_set_dir_wc.format(fs_prefix = df_loaded.fs_prefix, df = df, sample_set = sample_set_name) - sample_set_loc = os.path.join(sample_set_dir, 'sample_set.tsv') - - - sample_set = sample_sets[sample_set_name] - if not os.path.exists(sample_set_dir): - os.makedirs(sample_set_dir, exist_ok=True) - - if not os.path.isfile(sample_set_loc): - sample_set.to_csv(sample_set_loc, sep='\t', index=False) - else: - click.secho('Sample set with this name already exists!') - if overwrite: - sample_set.to_csv(sample_set_loc, sep='\t', index=False) - click.secho('Overwritten') - - - res_list += [result_wc.format( - fs_prefix = df_loaded.fs_prefix, - df = df_loaded.df, - sample_set = sample_set_name, - **kwargs - )] - +import assnake.api.loaders +import assnake +from tabulate import tabulate +import click, os, datetime +import pandas as pd + + +def generic_command_dict_of_sample_sets(config, df, preproc, meta_column, column_value, samples_to_add, exclude_samples, **kwargs): + ''' + This returns several sample sets. + ''' + df_loaded = assnake.Dataset(df) + + sample_sets_dict = {} + + meta_loc = os.path.join(df_loaded.full_path, 'df_samples.tsv') + if os.path.isfile(meta_loc): + meta = pd.read_csv(meta_loc, sep = '\t', index_col=0) + if meta_column is not None: + if column_value is not None: + sample_set, sample_set_name = generic_command_individual_samples(config, df, preproc, meta_column, column_value, samples_to_add, exclude_samples, **kwargs) + sample_sets_dict.update({sample_set_name: sample_set}) + else: # treat empty column_value as creating multiple sample_sets for each column_value + column_values = list(meta[meta_column].unique()) + for column_value in column_values: + sample_set, sample_set_name = generic_command_individual_samples(config, df, preproc, meta_column, column_value, samples_to_add, exclude_samples, **kwargs) + sample_sets_dict.update({sample_set_name: sample_set}) + else: + sample_set, sample_set_name = generic_command_individual_samples(config, df, preproc, meta_column, column_value, samples_to_add, exclude_samples, **kwargs) + sample_sets_dict.update({sample_set_name: sample_set}) + else: + if meta_column is not None: + click.echo('A metadata column is specified, but there is no metadata file: %s'%meta_loc, fg='red') + exit() + sample_set, sample_set_name = generic_command_individual_samples(config, df, preproc, meta_column, column_value, samples_to_add, exclude_samples, **kwargs) + sample_sets_dict.update({sample_set_name: sample_set}) + + return sample_sets_dict + +def generic_command_individual_samples(config, df, preproc, meta_column, column_value, samples_to_add, exclude_samples, **kwargs): + """ + Construct sample sets, has multiple options. + Returns dict or sample_sets based on the provided options. + + meta_column - factor column in sample metadata sheet. Cannot be combined with samples_to_add. exclude_samples has higher proirity. + column_value - value of column to select by. \ + Can be multiple - separated by commas without whitespace. \ + If meta_column is provided, bot no column_value is provided \ + - treat it like select all unique values of that column. + If multiple - one value - one sample_set. If --merge enabled - all values go in one sample_set + + assnake result request megahit -d FMT_FHM -c source run + + """ + exclude_samples = [] if exclude_samples == '' else [c.strip() for c in exclude_samples.split(',')] + samples_to_add = [] if samples_to_add == '' else [c.strip() for c in samples_to_add.split(',')] + + df_loaded = assnake.Dataset(df) + config['requested_dfs'] += [df_loaded.df] + + # Now for the meta column stuff + meta_loc = os.path.join(df_loaded.full_path, 'df_samples.tsv') + if os.path.isfile(meta_loc): + meta = pd.read_csv(meta_loc, sep = '\t', index_col=0) + if meta_column is not None: + if column_value is not None: + subset_by_col_value = meta.loc[meta[meta_column] == column_value] + if len(subset_by_col_value) > 0: + samples_to_add = list(subset_by_col_value.index.values) + samples_to_add = list(set(df_loaded.sample_containers['df_sample'].values).intersection(set(samples_to_add))) + if len(samples_to_add) == 0: + click.secho('There are 0 samples for %s == %s'%(meta_column, column_value), fg='red') + exit() + + if preproc is None: + # LONGEST + click.echo('Preprocessing is not specified, using longest for now') + preproc = max(list(df_loaded.sample_sets.keys()), key=len) + + sample_set = assnake.api.loaders.load_sample_set(config['wc_config'], df_loaded.fs_prefix, df_loaded.df, preproc, samples_to_add=samples_to_add) + if len(exclude_samples) > 0 : + sample_set = sample_set.loc[~sample_set['df_sample'].isin(exclude_samples), ] + + # click.echo(tabulate(sample_set[['df_sample', 'reads', 'preproc']].sort_values('reads'), headers='keys', tablefmt='fancy_grid')) + + # construct sample set name for fs + curr_date = datetime.datetime.now() + def_name = '{date}{month}{year}_{hour}{minute}%s'.format( + date=curr_date.strftime("%d"), + month=curr_date.strftime("%b"), + year=curr_date.strftime("%y"), + hour=curr_date.strftime("%H"), + minute=curr_date.strftime("%M")) + if meta_column is None and column_value is None: + sample_set_name = def_name%('') + elif meta_column is not None and column_value is None: + sample_set_name = def_name%('__' + meta_column) + else: + sample_set_name = def_name%('__' + meta_column + '_' + column_value) + + return sample_set, sample_set_name + + +def generate_result_list(sample_set, wc_str, **kwargs): + res_list = [] + # print(kwargs) + kwargs.pop('df') + kwargs.pop('preproc') + for s in sample_set.to_dict(orient='records'): + preprocessing = s['preproc'] + + res_list.append(wc_str.format( + fs_prefix = s['fs_prefix'].rstrip('\/'), + df = s['df'], + preproc = preprocessing, + df_sample = s['df_sample'], + **kwargs + )) + return res_list + +def prepare_sample_set_tsv_and_get_results(sample_set_dir_wc, result_wc, df, sample_sets, overwrite,**kwargs): + res_list = [] + + df_loaded = assnake.Dataset(df) + + for sample_set_name in sample_sets.keys(): + sample_set_dir = sample_set_dir_wc.format(fs_prefix = df_loaded.fs_prefix, df = df, sample_set = sample_set_name) + sample_set_loc = os.path.join(sample_set_dir, 'sample_set.tsv') + + + sample_set = sample_sets[sample_set_name] + if not os.path.exists(sample_set_dir): + os.makedirs(sample_set_dir, exist_ok=True) + + if not os.path.isfile(sample_set_loc): + sample_set.to_csv(sample_set_loc, sep='\t', index=False) + else: + click.secho('Sample set with this name already exists!') + if overwrite: + sample_set.to_csv(sample_set_loc, sep='\t', index=False) + click.secho('Overwritten') + + + res_list += [result_wc.format( + fs_prefix = df_loaded.fs_prefix, + df = df_loaded.df, + sample_set = sample_set_name, + **kwargs + )] + return res_list \ No newline at end of file