diff --git a/singlecellmultiomics/modularDemultiplexer/baseDemultiplexMethods.py b/singlecellmultiomics/modularDemultiplexer/baseDemultiplexMethods.py index 9d26e310..2f52466b 100644 --- a/singlecellmultiomics/modularDemultiplexer/baseDemultiplexMethods.py +++ b/singlecellmultiomics/modularDemultiplexer/baseDemultiplexMethods.py @@ -40,7 +40,7 @@ def metaFromRead(read, tag): # Clean a string to be able to be used in a fastq file header -fastqCleanerRegex = re.compile('[^a-zA-Z0-9-_]', re.UNICODE) +fastqCleanerRegex = re.compile('[^a-zA-Z0-9-_.]', re.UNICODE) def fqSafe(string) -> str: """ @@ -147,7 +147,13 @@ def has_tag(self, tag): return tag in self.tags def asIlluminaHeader(self): - return '{Is}:{RN}:{Fc}:{La}:{Ti}:{CX}:{CY}'.format(**self.tags) + try: + return '{Is}:{RN}:{Fc}:{La}:{Ti}:{CX}:{CY}'.format(**self.tags) + except KeyError as e: + if 'oh' in self.tags: + return self.tags['oh'] + raise e + @@ -188,6 +194,11 @@ def parse_3dec_header(self,fastqRecord, indexFileParser, indexFileAlias): 'CN': controlNumber }) + def parse_other_header(self,fastqRecord, indexFileParser, indexFileAlias): + self.tags.update({ + 'oh':fastqRecord.header[1:].replace(';','').split()[0] # original header + }) + def _parse_illumina_header(self,header, indexFileParser = None, indexFileAlias = None): @@ -270,7 +281,10 @@ def fromRawFastq( if fastqRecord.header.startswith('@Is'): self.parse_scmo_header(fastqRecord, indexFileParser, indexFileAlias) else: - self.parse_3dec_header(fastqRecord, indexFileParser, indexFileAlias) + if fastqRecord.header.startswith('@Cluster'): + self.parse_3dec_header(fastqRecord, indexFileParser, indexFileAlias) + else: + self.parse_other_header(fastqRecord, indexFileParser, indexFileAlias) # NS500413:32:H14TKBGXX:2:11101:16448:1664 1:N:0:: @@ -379,10 +393,11 @@ def fromTaggedFastq(self, fastqRecord): def fromTaggedBamRecord(self, pysamRecord): try: + for keyValue in pysamRecord.query_name.strip().split(';'): key, value = keyValue.split(':') self.addTagByTag(key, value, isPhred=False) - except ValueError: + except ValueError as e: # Try to parse "Single Cell Discoveries" header # These have the following header: #NBXXXXXX:530:HXXXXXX:2:2:17:6;SS:GTCATTAG;CB:GTCATTAG;QT:eeeeeeee;RX:CTGAAC;RQ:aaaaae;SM:SAMPLE_NAME diff --git a/singlecellmultiomics/tags/tags.py b/singlecellmultiomics/tags/tags.py index a6ee591d..d8accad5 100644 --- a/singlecellmultiomics/tags/tags.py +++ b/singlecellmultiomics/tags/tags.py @@ -42,6 +42,7 @@ def __repr__(self): SamTag('QX', 'barcodeQual', isPhred=True), SamTag('bc', 'rawBarcode'), SamTag('hd', 'hammingDistanceRawBcAssignedBc'), + SamTag('oh', 'originalHeader'), SamTag('bi', 'barcodeIndex'), diff --git a/singlecellmultiomics/universalBamTagger/bamtagmultiome.py b/singlecellmultiomics/universalBamTagger/bamtagmultiome.py index 47dd596a..eebd912d 100755 --- a/singlecellmultiomics/universalBamTagger/bamtagmultiome.py +++ b/singlecellmultiomics/universalBamTagger/bamtagmultiome.py @@ -452,7 +452,7 @@ def tag_multiome_multi_processing( # Remove the temp dir: sleep(5) try: - shutil.rmtree(temp_folder, ignore_errors=False, onerror=None) + shutil.rmtree(temp_folder, ignore_errors=False) except Exception as e: sys.stderr.write(f'Failed to remove {temp_folder}\n') sys.stderr.write(f'{e}\n') diff --git a/tests/test_demultiplexing.py b/tests/test_demultiplexing.py index 6f21e280..c494b7a2 100644 --- a/tests/test_demultiplexing.py +++ b/tests/test_demultiplexing.py @@ -248,6 +248,42 @@ def test_3DEC_UmiBarcodeDemuxMethod_matching_barcode(self): self.assertEqual( demultiplexed_record[0].tags['BC'], 'ACACACTA') self.assertEqual( demultiplexed_record[0].tags['bi'], 1) + def test_sra_header(self): + + barcode_folder = str(importlib.resources.files('singlecellmultiomics').joinpath('modularDemultiplexer/barcodes/')) + + barcode_parser = BarcodeParser(barcode_folder,lazyLoad='*') + + r1 = FastqRecord( + '@SRR21016692.1 1/1', + 'ATCACACACTATAGTCATTCAGGAGCAGGTTCTTCAGGTTCCCTGTAGTTGTGTGGTTTTGAGTGAGTTTTTTAAT', + '+', + 'AAAAA#EEEEEEEEEEEAEEEEEEEAEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEE/EEEEEEEEEEEEEEEEEE' + ) + r2 = FastqRecord( + '@SRR21016692.1 1/2', + 'ACCCCAGATCAACGTTGGACNTCNNCNTTNTNCTCNGCACCNNNNCNNNCTTATNCNNNANNNNNNNNNNTNNGN', + '+', + '6AAAAEEAEE/AEEEEEEEE#EE##<#6E#A#EEE#EAEEA####A###EE6EE#E###E##########E##A#' + ) + demux = UmiBarcodeDemuxMethod(umiRead=0, + umiStart=0, + umiLength=3, + barcodeRead=0, + barcodeStart=3, + barcodeLength=8, + barcodeFileParser=barcode_parser, + barcodeFileAlias='maya_384NLA', + indexFileParser=None, + indexFileAlias='illumina_merged_ThruPlex48S_RP', + random_primer_read=None, + random_primer_length=6) + + demultiplexed_record = demux.demultiplex([r1,r2]) + # The barcode sequence is ACACACTA (first barcode) + self.assertEqual( demultiplexed_record[0].tags['BC'], 'ACACACTA') + self.assertEqual( demultiplexed_record[0].tags['bi'], 1) + if __name__ == '__main__': unittest.main()