diff --git a/README.md b/README.md index ffe490a7..9ad08166 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ -[![Documentation Status](https://readthedocs.org/projects/singlecellmultiomics/badge/?version=latest)](https://singlecellmultiomics.readthedocs.io/en/latest/?badge=latest) [![PyPI version](https://badge.fury.io/py/singlecellmultiomics.svg)](https://badge.fury.io/py/singlecellmultiomics) [![DOI](https://zenodo.org/badge/187592829.svg)](https://zenodo.org/badge/latestdoi/187592829) [![Anaconda-Server Badge](https://anaconda.org/buysdb/singlecellmultiomics/badges/installer/conda.svg)](https://anaconda.org/buysdb/singlecellmultiomics) - +[![Documentation Status](https://readthedocs.org/projects/singlecellmultiomics/badge/?version=latest)](https://singlecellmultiomics.readthedocs.io/en/latest/?badge=latest) [![PyPI version](https://badge.fury.io/py/singlecellmultiomics.svg)](https://badge.fury.io/py/singlecellmultiomics) [![DOI](https://zenodo.org/badge/187592829.svg)](https://zenodo.org/badge/latestdoi/187592829) ## Single cell multi omics -Single cell multi omics is a set of tools to deal with multiple measurements from the same cell. This package has been developed by the [van Oudenaarden group](https://www.hubrecht.eu/research-groups/van-oudenaarden-group/). +Single cell multi omics is a set of tools to deal with multiple measurements from the same cell. This package is maintained by [Barbanson Biotech](https://barbansonbiotech.com/). # Installation ``` @@ -32,7 +31,7 @@ The mapped reads are encoded in a BAM file. This BAM file still contains the enc methylation digest sequencing:SC MSPJI , lineage tracing:SCARTRACE, DNA digest sequencing: NLAIII, - histone modification sequencing: scCHIC, + Epigenetic modification sequencing: scCHIC, scCHIC+Transcriptome, DamID, DamID+T Single cell methylation : TAPs (in combination with any other supported protocol). 4) Assigns reads to molecules to allow for deduplication, adds duplication BAM flag diff --git a/setup.py b/setup.py index 72f5bbac..5fac5fe9 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from setuptools import setup +from setuptools import setup, find_namespace_packages import os import sys @@ -24,36 +24,12 @@ long_description=long_description, long_description_content_type='text/markdown', author='Buys de Barbanson', - author_email='b.barbanson@hubrecht.eu', + author_email='github@barbansonbiotech.com', url='https://github.com/BuysDB/SingleCellMultiOmics', download_url = 'https://github.com/BuysDB/SingleCellMultiOmics/archive/v0.1.9.tar.gz', license='MIT', - packages=['singlecellmultiomics', - - 'singlecellmultiomics.alleleTools', - 'singlecellmultiomics.bamProcessing', - 'singlecellmultiomics.barcodeFileParser', - 'singlecellmultiomics.countTableProcessing', - 'singlecellmultiomics.features', - 'singlecellmultiomics.fragment', - 'singlecellmultiomics.fastqProcessing', - 'singlecellmultiomics.fastaProcessing', - 'singlecellmultiomics.libraryDetection', - 'singlecellmultiomics.libraryProcessing', - 'singlecellmultiomics.modularDemultiplexer', - 'singlecellmultiomics.molecule', - 'singlecellmultiomics.methylation', - 'singlecellmultiomics.pyutils', - 'singlecellmultiomics.variants', - 'singlecellmultiomics.tags', - 'singlecellmultiomics.statistic', - 'singlecellmultiomics.tagtools', - 'singlecellmultiomics.universalBamTagger', - 'singlecellmultiomics.utils', - 'singlecellmultiomics.modularDemultiplexer.demultiplexModules' - ], - + packages=find_namespace_packages(), scripts=[ # Demultiplexing @@ -121,7 +97,6 @@ # Library processing: 'singlecellmultiomics/libraryProcessing/libraryStatistics.py', 'singlecellmultiomics/libraryProcessing/scsortchicstats.py', - 'singlecellmultiomics/libraryDetection/archivestats.py', 'singlecellmultiomics/alleleTools/heterozygousSNPedit.py', 'singlecellmultiomics/libraryProcessing/scsortchicfeaturedensitytable.py', 'singlecellmultiomics/libraryProcessing/scsortchicqc.py', diff --git a/singlecellmultiomics/libraryDetection/archivestats.py b/singlecellmultiomics/libraryDetection/archivestats.py deleted file mode 100644 index e0cf3310..00000000 --- a/singlecellmultiomics/libraryDetection/archivestats.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import argparse - -import pkg_resources -import singlecellmultiomics.barcodeFileParser.barcodeFileParser as barcodeFileParser -from singlecellmultiomics.modularDemultiplexer.demultiplexingStrategyLoader import DemultiplexingStrategyLoader -import singlecellmultiomics.libraryDetection.sequencingLibraryListing as sequencingLibraryListing -import glob -import fnmatch -import os -from types import SimpleNamespace - -if __name__ == '__main__': - barcode_dir = pkg_resources.resource_filename( - 'singlecellmultiomics', 'modularDemultiplexer/barcodes/') - index_dir = pkg_resources.resource_filename( - 'singlecellmultiomics', 'modularDemultiplexer/indices/') - - barcode_parser = barcodeFileParser.BarcodeParser( - hammingDistanceExpansion=0, barcodeDirectory=barcode_dir) - index_parser = barcodeFileParser.BarcodeParser( - hammingDistanceExpansion=1, barcodeDirectory=index_dir) - - dmx = DemultiplexingStrategyLoader(barcodeParser=barcode_parser, - indexParser=index_parser, - indexFileAlias=None) - - argparser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description="""Check multiplexability of many fastq files""") - - argparser.add_argument('-locations', default='.') - - arguments = argparser.parse_args() - sequencing_dirs = arguments.locations.split(',') - dmxes = sorted([x.shortName for x in dmx.demultiplexingStrategies]) - #print('\t'.join(['RUN', 'SEQ', 'AVO', 'INDEX', 'LIBRARY'] + dmxes)) - - matches = [] - for sdir in sequencing_dirs: - for root, dirnames, filenames in os.walk(sdir): - for d in dirnames: - try: - fp = os.path.join(root, d) - #print(' ' + fp) - if len(list(glob.glob(fp + '/*.fastq.gz')) - ) and d != 'BaseCalls': - matches.append(fp) - fastqfiles = list(glob.glob(fp + '/*.fastq.gz')) - - args = SimpleNamespace( - replace=None, - fastqfiles=fastqfiles, - slib=None, - merge='_', - dsize=10000, - se=False, - ignore=True, - maxAutoDetectMethods=100, - minAutoDetectPct=1) - libraries = sequencingLibraryListing.SequencingLibraryLister( - verbose=False).detect(fastqfiles, args=args) - - processedReadPairs, strategyYieldsForAllLibraries = dmx.detectLibYields( - libraries, testReads=args.dsize, maxAutoDetectMethods=args.maxAutoDetectMethods, minAutoDetectPct=args.minAutoDetectPct, verbose=False) - - #print(strategyYieldsForAllLibraries) - - if False: - for library, associated_fastqs_lane in libraries.items(): - # Obtain run id - run_id = '?' - seqid = '?' - index = '?' - avo_id = '?' - i = None - dpos = None - for lane, reads in associated_fastqs_lane.items(): - parts = os.path.dirname( - reads['R1'][0]).split('/') - - if 'Data' in parts: - try: - i = parts.index('Data') - except Exception as e: - i = -2 - - try: - dpos = parts.index('BaseCalls') - except Exception as e: - dpos = -1 - pass - - try: - run_id = parts[i - 2] - except Exception as e: - pass - - try: - seqid = parts[i - 1] - except Exception as e: - pass - - try: - index = parts[dpos + 2] - except Exception as e: - pass - - try: - avo_id = parts[dpos + 1] - except Exception as e: - pass - - else: - try: - avo_id = parts[-2] - except Exception as e: - pass - - try: - index = parts[-1] - except Exception as e: - pass - - break - - - - for library in libraries: - processedReadPairs = strategyYieldsForAllLibraries[library]['processedReadPairs'] - strategyYieldForLibrary = strategyYieldsForAllLibraries[library]['strategyYields'] - selectedStrategies = dmx.selectedStrategiesBasedOnYield( - processedReadPairs, - strategyYieldForLibrary, - maxAutoDetectMethods=args.maxAutoDetectMethods, - minAutoDetectPct=args.minAutoDetectPct) - selectedStrategies = dmx.getSelectedStrategiesFromStringList( - selectedStrategies, verbose=False) - - print(library,selectedStrategies[0].shortName) - - #print('\t'.join([run_id, seqid, avo_id, index, library] + [str( - # strategyYieldsForAllLibraries[library]['strategyYields'].get(x, 0) ) for x in dmxes])) - except Exception as e: - raise - #print(e) diff --git a/singlecellmultiomics/libraryProcessing/libraryStatistics.py b/singlecellmultiomics/libraryProcessing/libraryStatistics.py index 99ed0c99..58f9e852 100644 --- a/singlecellmultiomics/libraryProcessing/libraryStatistics.py +++ b/singlecellmultiomics/libraryProcessing/libraryStatistics.py @@ -87,6 +87,10 @@ def select_fastq_file(lookup): argparser.add_argument('--v', action='store_true') argparser.add_argument('--nort', action='store_true') argparser.add_argument('--nolorenz', action='store_true') + + argparser.add_argument('-demux_R1_path', type=str) + argparser.add_argument('-demux_R2_path', type=str) + args = argparser.parse_args() for library in args.libraries: @@ -127,6 +131,13 @@ def select_fastq_file(lookup): if(args.t in ['chic-stats', 'all-stats']): statistics.extend([ScCHICLigation(args)]) + if args.t=='non-scmo-stats': + statistics.extend([ + ScCHICLigation(args) + + ]) + + if(args.t in ['demult-stats', 'all-stats']): statistics.extend([ TrimmingStats(args), @@ -137,13 +148,20 @@ def select_fastq_file(lookup): PlateStatistic2(args) ]) - demuxFastqFilesLookup = [ - (f'{library}/demultiplexedR1.fastq.gz', - f'{library}/demultiplexedR2.fastq.gz'), - (f'{library}/demultiplexedR1_val_1.fq.gz', - f'{library}/demultiplexedR2_val_2.fq.gz'), - (f'{library}/demultiplexedR1_val_1.fq', - f'{library}/demultiplexedR2_val_2.fq')] + if args.demux_R1_path is not None: + assert args.demux_R2_path is not None + demuxFastqFilesLookup = [ + (args.demux_R1_path,args.demux_R2_path), + ] + else: + demuxFastqFilesLookup = [ + (f'{library}/demultiplexedR1.fastq.gz', + f'{library}/demultiplexedR2.fastq.gz'), + (f'{library}/demultiplexedR1_val_1.fq.gz', + f'{library}/demultiplexedR2_val_2.fq.gz'), + (f'{library}/demultiplexedR1_val_1.fq', + f'{library}/demultiplexedR2_val_2.fq') + ] rejectFilesLookup = [ (f'{library}/rejectsR1.fastq.gz', f'{library}/rejectsR2.fastq.gz'), diff --git a/singlecellmultiomics/libraryProcessing/scsortchicqc.py b/singlecellmultiomics/libraryProcessing/scsortchicqc.py index 06ecfe14..007a6348 100644 --- a/singlecellmultiomics/libraryProcessing/scsortchicqc.py +++ b/singlecellmultiomics/libraryProcessing/scsortchicqc.py @@ -95,7 +95,6 @@ def read_contaminant_info(sortchicstats_paths): statistics_paths = [] count_table_paths = [] for path in args.count_tables_sortchicstats_statistics: - if path.endswith('statistics.pickle.gz'): statistics_paths.append(path) elif path.endswith('sortchicstats.json'): @@ -111,7 +110,10 @@ def read_contaminant_info(sortchicstats_paths): # Read the count tables df = pd.concat([read_count_table(path) for path in count_table_paths]) # Add mark as first level of df, library second, cell third - df.index = pd.MultiIndex.from_tuples([(sample_sheet['marks'][cell.split('_')[0]], cell.split('_')[0], int(cell.split('_')[1])) for cell in df.index]) + df.index = pd.MultiIndex.from_tuples([( + sample_sheet['marks'][cell.split('_')[0]], + cell.split('_')[0], int(cell.split('_')[1])) + for cell in df.index]) avail_marks = df.index.get_level_values(0).unique() print('Target marks:') @@ -135,7 +137,8 @@ def read_contaminant_info(sortchicstats_paths): y = cell_labels=='empty' rf = RandomForestClassifier(class_weight='balanced') - X = plate_stats.loc[y.index] + y=y.loc[[idx for idx in y.index if idx in plate_stats.index]] + X = plate_stats.loc[[idx for idx in y.index if idx in plate_stats.index]] X[('AA', 'ligated molecules')]/=X[('total mapped', '# molecules')] X[('TA', 'fraction ligated molecules')]= X[('TA', 'ligated molecules')] / X[('total mapped', '# molecules')] X[('TT', 'ligated molecules')]/=X[('total mapped', '# molecules')] @@ -143,7 +146,8 @@ def read_contaminant_info(sortchicstats_paths): X[('duprate', 'pct')] =X[('total mapped', '# molecules')]/X[('total mapped', '# reads')] y[X[('total mapped','# reads')]<500] = True - X = X.join(contaminant_info) + X = X.join(contaminant_info).fillna(0) + X = X.replace([np.inf,], 0) predictions = [] for train_index, test_index in KFold(n_splits=8, shuffle=True, random_state=None).split(X): diff --git a/singlecellmultiomics/libraryProcessing/scsortchicstats.py b/singlecellmultiomics/libraryProcessing/scsortchicstats.py index 1152b446..66002253 100644 --- a/singlecellmultiomics/libraryProcessing/scsortchicstats.py +++ b/singlecellmultiomics/libraryProcessing/scsortchicstats.py @@ -9,19 +9,43 @@ from collections import Counter, defaultdict import numpy as np +def safe_read(read): + if read.is_unmapped or \ + not read.is_proper_pair or \ + read.is_qcfail: + return False + if not read.is_read1: + return False + if read.mapping_quality<60: + return False + if read.has_tag('NM') and read.get_tag('NM')>2: + return False + # Check for indels + if 'I' in read.cigarstring or 'S' in read.cigarstring or 'D' in read.cigarstring: + return False + return True + + + def count_reads(args): bam, contig = args passreads = 0 - qcfailreads, duplicate_reads = 0, 0 + unmapped_reads, qcfailreads, duplicate_reads, safe_reads = 0, 0, 0, 0 with pysam.AlignmentFile(bam) as al: for read in al.fetch(contig): - if read.is_qcfail: + if read.is_supplementary or read.is_secondary: + continue + if read.is_unmapped: + unmapped_reads+=1 + elif read.is_qcfail: qcfailreads+=1 elif read.is_duplicate: duplicate_reads+=1 else: passreads+=1 - return contig, passreads, qcfailreads, duplicate_reads + if safe_read(read): + safe_reads+=1 + return contig, passreads, qcfailreads, duplicate_reads, unmapped_reads, safe_reads def count_reads_sc(args): bam, contig = args @@ -47,34 +71,67 @@ def np_encoder(object): description='Obtain statistics for scSort-ChIC libraries') argparser.add_argument('bam',type=str, help='Path to tagged bam file') - argparser.add_argument('featurebeds',type=str, nargs='+', help='Path to feature coverage bed files') + argparser.add_argument('featurebeds',type=str, nargs='*', help='Path to feature coverage bed files') argparser.add_argument('-o',type=str, help='Path to output json file') + + argparser.add_argument('-contaminants_species', type=str,help='Path to contaminant specie files', nargs='*') + argparser.add_argument('-t',type=int, help='Threadcount') argparser.add_argument('--sc',action= 'store_true', help='Generate also stats on a single cell basis') + argparser.add_argument('-mito_id', help='Add MT / chrM here to flag the mitochondria as contaminant') args = argparser.parse_args() - - scaffolds = [('Cutibacterium','CP025935.1'), - ('E. coli RHB09-C15','CP057942.1'), - ('E. coli K-12','NC_000913.3'), - ('E. coli Lambda Phage','J02459.1'), - ('Mitochondrial','MT') - ] + if args.contaminants_species is not None: + print('Passed:', args.contaminants_species) + contaminant_contigs = [] + contig_to_spec=dict() + for path in args.contaminants_species: + print(path) + species_table = pd.read_csv(path, delimiter='\t', header=None, + index_col=None) + species_table.columns = ['species', 'contig', 'gi'] + species_table = species_table.set_index('contig') + print(species_table) + contig_to_spec.update( species_table['species'].to_dict() ) + + contaminant_contigs += [ + (spec,contig) for contig,spec in contig_to_spec.items() + ] + print(contig_to_spec) + if args.mito_id is not None: + contaminant_contigs+= [ ('Mitochondrial', args.mito_id) ] + contig_to_spec[args.mito_id] = 'Mitochondrial' + else: + contaminant_contigs = [('Cutibacterium','CP025935.1'), + ('E. coli RHB09-C15','CP057942.1'), + ('E. coli K-12','NC_000913.3'), + ('E. coli Lambda Phage','J02459.1'), + ('Mitochondrial',args.mito_id) + ] # df_alignments = pd.DataFrame([ [(int(x) if i>0 else x) for i,x in enumerate(r.split('\t'))] for r in pysam.idxstats(args.bam).split('\n')]) # df_alignments = df_alignments.set_index(0) cnts = dict() # pass read counts - qcfailreads, duplicatereads = 0, 0 + qcfailreads, duplicatereads, usablereads, unmapped_reads, n_passreads, n_safe_reads = 0, 0, 0, 0, 0, 0 + # The safe reads are pass reads which map uniquely, and in a proper pair + with pysam.AlignmentFile(args.bam) as al: with Pool(args.t) as workers: - for contig, passreads, _qcfailreads, _duplicatereads in workers.imap_unordered( count_reads, ((args.bam,contig) for contig in al.references)): - cnts[contig] = passreads #dict(workers.imap_unordered( count_reads, ((args.bam,contig) for contig in al.references))) + for contig, passreads, _qcfailreads, _duplicatereads, _unmapped_reads, _safe_reads in workers.imap_unordered( + count_reads, ((args.bam,contig) for contig in list(al.references)+['*'])): + # Only use uniquely mappable reads to determine contig counts: + cnts[contig] = _safe_reads #dict(workers.imap_unordered( count_reads, ((args.bam,contig) for contig in al.references))) qcfailreads += _qcfailreads duplicatereads += _duplicatereads + unmapped_reads += _unmapped_reads + if contig not in contig_to_spec: + usablereads += passreads + n_safe_reads+=_safe_reads + n_passreads+= passreads cnts = pd.Series( cnts ) - passreads = cnts.sum() + safely_mappables = cnts.sum() totals = {} for path in args.featurebeds: @@ -82,17 +139,25 @@ def np_encoder(object): df.columns=['contig','start','end','obs'] totals[path.split('_')[-2]] = df['obs'].sum() - scaffold_coverage = pd.Series({ - name:float(100*cnts[scaffold]/passreads) for name, scaffold in scaffolds + scaffold_coverages_per_species = Counter() + for spec, contig in contaminant_contigs: + scaffold_coverages_per_species[spec] += cnts.get(contig,0) + # Normalize: + scaffold_coverage = pd.Series({ + name:float(100*n_reads/safely_mappables) for name, n_reads in scaffold_coverages_per_species.items() }).to_dict() + + d = { - 'feature_coverage': {k:float(v) for k,v in (100*pd.Series(totals)/passreads).to_dict().items()}, + 'feature_coverage': {k:float(v) for k,v in (100*pd.Series(totals)/usablereads).to_dict().items()}, 'scaffold_coverage':scaffold_coverage, - 'passreads':int(passreads), + 'passreads':int(n_passreads), 'duplicatereads': duplicatereads, 'qcfailreads': qcfailreads, - 'totalreads': passreads+duplicatereads+qcfailreads + 'totalreads': n_passreads+duplicatereads+qcfailreads+unmapped_reads, + 'usablereads':usablereads, + 'unmapped_reads':unmapped_reads } if args.sc: @@ -105,9 +170,11 @@ def np_encoder(object): sc_qcfailreads += _qcfailreads sc_duplicatereads += _duplicatereads - sc_scaffold_cov = dict() - for scaffold_name, scaffold in scaffolds: - sc_scaffold_cov[scaffold_name] = {k:int(v) for k,v in sc_cnts[scaffold].items()} + sc_scaffold_cov = defaultdict(Counter) # contaminant_group_name -> cell -> count + for contaminant_group_name, contig in contaminant_contigs: + for cell, n in sc_cnts[contig].items(): + sc_scaffold_cov[contaminant_group_name][cell] += n + d['sc_se_scaffold_cov'] = sc_scaffold_cov diff --git a/singlecellmultiomics/snakemake_workflows/scmo_workflow.py b/singlecellmultiomics/snakemake_workflows/scmo_workflow.py index 8c51f217..96b406d2 100644 --- a/singlecellmultiomics/snakemake_workflows/scmo_workflow.py +++ b/singlecellmultiomics/snakemake_workflows/scmo_workflow.py @@ -2,16 +2,16 @@ # -*- coding: utf-8 -*- import singlecellmultiomics import argparse -import pkg_resources from colorama import Fore, Style import os from shutil import copyfile - +import importlib.resources def get_workflow_list(include_hidden=False): - return [ workflow - for workflow in pkg_resources.resource_listdir('singlecellmultiomics','snakemake_workflows') - if not workflow.endswith('.py') and (include_hidden or not workflow.startswith('_') ) ] + workflows_dir = importlib.resources.files('singlecellmultiomics').joinpath('snakemake_workflows') + return [workflow.name + for workflow in workflows_dir.iterdir() + if workflow.is_dir() and (include_hidden or not workflow.name.startswith('_'))] def deploy_workflow_files(name, clean=False, directory='./'): @@ -28,18 +28,19 @@ def deploy_workflow_files(name, clean=False, directory='./'): raise ValueError(f"Unkown workflow {name}") - base_path = f'snakemake_workflows/{name}' - for file_to_copy in pkg_resources.resource_listdir('singlecellmultiomics',base_path): + base_path = importlib.resources.files('singlecellmultiomics').joinpath(f'snakemake_workflows/{name}') + for file_to_copy in base_path.iterdir(): # Files starting with a _ are never copied - if file_to_copy.startswith('_'): + if file_to_copy.name.startswith('_'): continue - print(Fore.GREEN + f'Creating {file_to_copy}' + Style.RESET_ALL, end="\t") + print(Fore.GREEN + f'Creating {file_to_copy.name}' + Style.RESET_ALL, end="\t") - target = directory+f'/{file_to_copy}' + target = os.path.join(directory, file_to_copy.name) if os.path.exists(target) and not clean: print(Fore.RED + f'The file {target} already exists! Skipping!' + Style.RESET_ALL) else: - copyfile( pkg_resources.resource_filename('singlecellmultiomics', base_path+'/'+file_to_copy),target ) + with importlib.resources.path('singlecellmultiomics', f'snakemake_workflows/{name}/{file_to_copy.name}') as source: + copyfile(source, target) print(Fore.GREEN + f'[ok]' + Style.RESET_ALL) if __name__=='__main__': diff --git a/singlecellmultiomics/statistic/plate.py b/singlecellmultiomics/statistic/plate.py index 50666540..a82ee2de 100644 --- a/singlecellmultiomics/statistic/plate.py +++ b/singlecellmultiomics/statistic/plate.py @@ -166,7 +166,7 @@ def cell_counts_to_dataframe(self, cell_counts, mux, name='raw_reads'): offset = 0 # Offset is zero for all protocols since 0.1.12 - format = 384 if ('384' in mux or mux.startswith('CS2')) else 96 + format = 96 if ( mux.startswith('CS1C8U4') or mux.startswith('NLAIII96') or mux.startswith('MSPJIC8U3')) else 384 df['col'] = [index2well[format] [(offset + int(x.rsplit('_')[-1]))][1] for x in df.index] diff --git a/singlecellmultiomics/statistic/plate2.py b/singlecellmultiomics/statistic/plate2.py index aeb2793b..340c3ac3 100644 --- a/singlecellmultiomics/statistic/plate2.py +++ b/singlecellmultiomics/statistic/plate2.py @@ -49,7 +49,7 @@ def processRead(self, R1,R2=None, default_lib=None): if read.has_tag('MX'): mux= read.get_tag('MX') - format = 384 if ('384' in mux or mux.startswith('CS2')) else 96 + format = 96 if ( mux.startswith('CS1C8U4') or mux.startswith('NLAIII96') or mux.startswith('MSPJIC8U3')) else 384 if format==384: n_rows = 16 n_cols = 24 diff --git a/singlecellmultiomics/universalBamTagger/bamtagmultiome.py b/singlecellmultiomics/universalBamTagger/bamtagmultiome.py index 829bf2e5..47dd596a 100755 --- a/singlecellmultiomics/universalBamTagger/bamtagmultiome.py +++ b/singlecellmultiomics/universalBamTagger/bamtagmultiome.py @@ -20,7 +20,7 @@ from singlecellmultiomics.fastaProcessing import CachedFastaNoHandle from singlecellmultiomics.utils.prefetch import UnitialisedClass from singlecellmultiomics.utils import create_fasta_dict_file -from multiprocessing import Pool +import multiprocessing from typing import Generator import argparse import uuid @@ -401,40 +401,50 @@ def tag_multiome_multi_processing( # @todo : Progress indication bam_files_generated = [] - - if use_pool: - workers = Pool(n_threads) - job_generator = workers.imap_unordered(run_tagging_tasks, tasks) - - else: - job_generator = (run_tagging_tasks(task) for task in tasks) - total_processed_molecules = 0 - for bam, meta in job_generator: - if bam is not None: - bam_files_generated.append(bam) - if len(meta): - total_processed_molecules+=meta['total_molecules'] - timeouts = meta.get('timeout_tasks',[]) - for timeout in timeouts: - print('blacklisted', timeout['contig'], timeout['start'], timeout['end']) - add_blacklisted_region(input_header, - contig=timeout['contig'], - start=timeout['start'], - end=timeout['end'] - ) - if head is not None and total_processed_molecules>head: - print('Head was supplied, stopping') - break - tagged_bam_generator = [temp_header_bam_path] + bam_files_generated + if use_pool: + with multiprocessing.get_context('spawn').Pool(n_threads) as workers: + for bam, meta in workers.imap_unordered(run_tagging_tasks, tasks): + if bam is not None: + bam_files_generated.append(bam) + if len(meta): + total_processed_molecules+=meta['total_molecules'] + timeouts = meta.get('timeout_tasks',[]) + for timeout in timeouts: + print('blacklisted', timeout['contig'], timeout['start'], timeout['end']) + add_blacklisted_region(input_header, + contig=timeout['contig'], + start=timeout['start'], + end=timeout['end'] + ) + if head is not None and total_processed_molecules>head: + print('Head was supplied, stopping') + break + tagged_bam_generator = [temp_header_bam_path] + bam_files_generated + else: # + for bam, meta in (run_tagging_tasks(task) for task in tasks): + if bam is not None: + bam_files_generated.append(bam) + if len(meta): + total_processed_molecules+=meta['total_molecules'] + timeouts = meta.get('timeout_tasks',[]) + for timeout in timeouts: + print('blacklisted', timeout['contig'], timeout['start'], timeout['end']) + add_blacklisted_region(input_header, + contig=timeout['contig'], + start=timeout['start'], + end=timeout['end'] + ) + if head is not None and total_processed_molecules>head: + print('Head was supplied, stopping') + break with pysam.AlignmentFile(temp_header_bam_path, 'wb', header=input_header) as out: pass pysam.index(temp_header_bam_path) # merge the results and clean up: - if use_pool: - workers.close() + print('Merging final bam files') merge_bams(list(tagged_bam_generator), out_bam_path, threads=n_threads) @@ -605,7 +615,7 @@ def run_multiome_tagging(args): unphased_alleles(str) : Path to VCF containing unphased germline SNPs - mapfile (str) : 'Path to .safe.bgzf file, used to decide if molecules are uniquely mappable, generate one using createMapabilityIndex.py + mapfile (str) : 'Path to /*.safe.bgzf file, used to decide if molecules are uniquely mappable, generate one using createMapabilityIndex.py annotmethod (int) : Annotation resolving method. 0: molecule consensus aligned blocks. 1: per read per aligned base diff --git a/singlecellmultiomics/universalBamTagger/universalBamTagger.py b/singlecellmultiomics/universalBamTagger/universalBamTagger.py index 0593e944..fd30bf0b 100644 --- a/singlecellmultiomics/universalBamTagger/universalBamTagger.py +++ b/singlecellmultiomics/universalBamTagger/universalBamTagger.py @@ -1,13 +1,8 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import re import pysam -import time -import sys -import multiprocessing -import subprocess import os -from singlecellmultiomics.utils.sequtils import hamming_distance, phred_to_prob +from singlecellmultiomics.utils.sequtils import hamming_distance from singlecellmultiomics.universalBamTagger.taps import TAPSFlagger from singlecellmultiomics.universalBamTagger.tag import TagFlagger from singlecellmultiomics.universalBamTagger.scar import ScarFlagger @@ -17,9 +12,6 @@ from singlecellmultiomics.universalBamTagger.digest import DigestFlagger from singlecellmultiomics.universalBamTagger.rna import RNA_Flagger import warnings -import numpy as np -import math -import singlecellmultiomics.features import singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods from singlecellmultiomics.tagtools import tagtools import argparse @@ -27,9 +19,7 @@ import gzip import pickle from singlecellmultiomics.alleleTools import alleleTools -import uuid import collections -import glob c = 1_000 # !!! PLEASE USE PYTHON 3.6 OR HIGHER !!! complement = str.maketrans('ATGC', 'TACG') diff --git a/singlecellmultiomics/version.py b/singlecellmultiomics/version.py index 6531da5f..b155c640 100644 --- a/singlecellmultiomics/version.py +++ b/singlecellmultiomics/version.py @@ -1,2 +1,2 @@ -__version__ = '0.1.40' +__version__ = '0.1.41' diff --git a/tests/test_aux_data.py b/tests/test_aux_data.py index 70ac4a74..5b581935 100644 --- a/tests/test_aux_data.py +++ b/tests/test_aux_data.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- import unittest -import pkg_resources import singlecellmultiomics +import importlib.resources """ These tests check if the Molecule module is working correctly """ @@ -11,7 +11,7 @@ class TestAuxData(unittest.TestCase): def test_barcode_files_presence(self): """Test if the barcode files can be accessed""" - available_barcodes = pkg_resources.resource_listdir('singlecellmultiomics','modularDemultiplexer/barcodes/') + available_barcodes = [p.name for p in (importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/barcodes/')).iterdir() if p.is_file()] at_least_required = ['nla_bisulfite.bc', 'lennart96NLA.bc', @@ -26,11 +26,11 @@ def test_barcode_files_presence(self): 'celseq2.bc'] for a in at_least_required: - self.assertIn(a,at_least_required) + self.assertIn(a,available_barcodes) def test_index_files_presence(self): """Test if the index files can be accessed""" - available_barcodes = pkg_resources.resource_listdir('singlecellmultiomics','modularDemultiplexer/indices/') + available_barcodes = [p.name for p in (importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/indices/')).iterdir() if p.is_file()] at_least_required = ['illumina_TruSeq_indices.bc', 'illumina_RP_indices.bc', @@ -40,7 +40,7 @@ def test_index_files_presence(self): 'illumina_i7_indices.bc'] for a in at_least_required: - self.assertIn(a,at_least_required) + self.assertIn(a,available_barcodes) if __name__ == '__main__': unittest.main() diff --git a/tests/test_barcodeFileParser.py b/tests/test_barcodeFileParser.py index e734755a..ddb7c1b9 100644 --- a/tests/test_barcodeFileParser.py +++ b/tests/test_barcodeFileParser.py @@ -3,7 +3,7 @@ import unittest import itertools import singlecellmultiomics.barcodeFileParser.barcodeFileParser as barcodeFileParser -import pkg_resources +import importlib.resources """ These tests check if the barcode file parser is working correctly @@ -26,7 +26,7 @@ def test_insert(self): self.assertEqual(barcode,'AAA') self.assertEqual(hd,0) - def test_hamming_expansion(self): + def test_hamming_expansion_algo(self): b = barcodeFileParser.BarcodeParser() b.addBarcode( barcodeFileAlias = 'test', barcode='AAA', index='TEST1') @@ -103,10 +103,9 @@ def test_hamming_expansion_collisions(self): def lazy_load_case(self, lazyloadsetting): - b = barcodeFileParser.BarcodeParser(pkg_resources.resource_filename( - 'singlecellmultiomics', - 'modularDemultiplexer/barcodes/'), lazyLoad = lazyloadsetting,hammingDistanceExpansion=1) - + b = barcodeFileParser.BarcodeParser( + str(importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/barcodes/')) + , lazyLoad = lazyloadsetting,hammingDistanceExpansion=1) index, barcode, hd = b.getIndexCorrectedBarcodeAndHammingDistance('GAATCTCG','celseq1') self.assertEqual(index,49) diff --git a/tests/test_demultiplexing.py b/tests/test_demultiplexing.py index 5b334d9a..6f21e280 100644 --- a/tests/test_demultiplexing.py +++ b/tests/test_demultiplexing.py @@ -7,15 +7,15 @@ from singlecellmultiomics.modularDemultiplexer.demultiplexModules.scCHIC import SCCHIC_384w_c8_u3_cs2 from singlecellmultiomics.modularDemultiplexer.demultiplexModules.DamID import DamID2andT_SCA,DamID2_SCA from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import UmiBarcodeDemuxMethod -import pkg_resources from singlecellmultiomics.utils import reverse_complement +import importlib.resources class TestUmiBarcodeDemux(unittest.TestCase): def test_UmiBarcodeDemuxMethod_matching_barcode(self): - barcode_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/barcodes/') - index_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/indices/') + barcode_folder = str(importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/barcodes/')) + index_folder = str(importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/indices/')) barcode_parser = BarcodeParser(barcode_folder, lazyLoad='*') index_parser = BarcodeParser(index_folder, lazyLoad='*') @@ -53,8 +53,8 @@ def test_UmiBarcodeDemuxMethod_matching_barcode(self): def test_CS2_NH_matching_barcode(self): - barcode_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/barcodes/') - index_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/indices/') + barcode_folder = str(importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/barcodes/')) + index_folder = str(importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/indices/')) barcode_parser = BarcodeParser(barcode_folder, lazyLoad='*') index_parser = BarcodeParser(index_folder, lazyLoad='*') @@ -100,8 +100,8 @@ def construct_tchic_read(self,crx,ccb,trx,tcb,mr,linker): def test_TCHIC(self): - barcode_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/barcodes/') - index_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/indices/') + barcode_folder = str(importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/barcodes/')) + index_folder = str(importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/indices/')) barcode_parser = BarcodeParser(barcode_folder, lazyLoad='*',) index_parser = BarcodeParser(index_folder, lazyLoad='*') @@ -165,8 +165,8 @@ def construct_read_pair(self, prefix, content): def test_DAMID(self): - barcode_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/barcodes/') - index_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/indices/') + barcode_folder = str(importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/barcodes/')) + index_folder = str(importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/indices/')) barcode_parser = BarcodeParser(barcode_folder, lazyLoad='*',) index_parser = BarcodeParser(index_folder, lazyLoad='*') @@ -214,7 +214,8 @@ def test_DAMID(self): def test_3DEC_UmiBarcodeDemuxMethod_matching_barcode(self): - barcode_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/barcodes/') + barcode_folder = str(importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/barcodes/')) + barcode_parser = BarcodeParser(barcode_folder,lazyLoad='*') r1 = FastqRecord( diff --git a/tests/test_tagger.py b/tests/test_tagger.py index f5adf695..c0257370 100644 --- a/tests/test_tagger.py +++ b/tests/test_tagger.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- import unittest -import itertools import pysam import os -import singlecellmultiomics.universalBamTagger.universalBamTagger as ut import singlecellmultiomics.universalBamTagger.bamtagmultiome as tm """