From b3be9f570be3c9fb6affd3ef660b460138cdc70c Mon Sep 17 00:00:00 2001 From: Robert Castelo Date: Tue, 14 Feb 2023 18:02:27 +0100 Subject: [PATCH 1/2] Support for strandMode in GAlignmentsList objects --- DESCRIPTION | 2 +- R/GAlignmentsList-class.R | 96 ++++++++++++++++++++++- R/findOverlaps-methods.R | 35 ++++++++- R/readGAlignments.R | 22 ++++-- inst/unitTests/test_readGAlignmentsList.R | 17 +++- man/GAlignmentsList-class.Rd | 3 + man/readGAlignments.Rd | 6 +- 7 files changed, 163 insertions(+), 18 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 280e30f..628dfe7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,7 +10,7 @@ biocViews: Infrastructure, DataImport, Genetics, Sequencing, RNASeq, SNP, URL: https://bioconductor.org/packages/GenomicAlignments Video: https://www.youtube.com/watch?v=2KqBSbkfhRo , https://www.youtube.com/watch?v=3PK_jx44QTs BugReports: https://github.com/Bioconductor/GenomicAlignments/issues -Version: 1.35.0 +Version: 1.35.1 License: Artistic-2.0 Encoding: UTF-8 Authors@R: c( diff --git a/R/GAlignmentsList-class.R b/R/GAlignmentsList-class.R index 857ad2a..9451337 100644 --- a/R/GAlignmentsList-class.R +++ b/R/GAlignmentsList-class.R @@ -6,16 +6,26 @@ setClass("GAlignmentsList", contains="CompressedRangesList", representation( + strandMode="integer", # single integer (0L, 1L, or 2L) unlistData="GAlignments", elementMetadata="DataFrame" ), prototype( + strandMode=1L, elementType="GAlignments", elementMetadata=new("DFrame") ) ) ### Formal API: +### strandMode(x) - indicates how to infer the strand of a pair from the +### strand of the first and last alignments in the pair: +### 0: strand of the pair is always *; +### 1: strand of the pair is strand of its first alignment; +### 2: strand of the pair is strand of its last alignment. +### These modes are equivalent to 'strandSpecific' equal 0, 1, +### and 2, respectively, for the featureCounts() function +### defined in the Rsubread package. ### names(x) - NULL or character vector. ### length(x) - single integer. Nb of alignments in 'x'. ### seqnames(x) - 'factor' Rle of the same length as 'x'. @@ -24,6 +34,7 @@ setClass("GAlignmentsList", ### rname(x) <- value - same as 'seqnames(x) <- value'. ### cigar(x) - character vector of the same length as 'x'. ### strand(x) - 'factor' Rle of the same length as 'x' (levels: +, -, *). +### obeys strandMode(x) (see above). ### qwidth(x) - integer vector of the same length as 'x'. ### start(x), end(x), width(x) - integer vectors of the same length as 'x'. ### njunc(x) - integer vector of the same length as 'x'. @@ -61,6 +72,10 @@ setClass("GAlignmentsList", ### Getters. ### +setMethod("strandMode", "GAlignmentsList", + function(x) x@strandMode +) + setMethod("seqnames", "GAlignmentsList", function(x) relist(seqnames(unlist(x, use.names=FALSE)), x) ) @@ -74,7 +89,26 @@ setMethod("cigar", "GAlignmentsList", ) setMethod("strand", "GAlignmentsList", - function(x) relist(strand(unlist(x, use.names=FALSE)), x) + function(x) + { + ga <- unlist(x, use.names=FALSE) + ga_mcols <- mcols(ga, use.names=FALSE) + s <- strand(unlist(x, use.names=FALSE)) + if (strandMode(x) == 0L) + s <- Rle(strand("*"), length(ga)) + else { + if (is.null(ga_mcols$flag)) + warning("Flag information missing in GAlignmentsList object. Strand information might not be accurate.") + else { + mask_first_mate <- bamFlagTest(ga_mcols$flag, "isFirstMateRead") + if (strandMode(x) == 1L) + s[!mask_first_mate] <- invertStrand(s[!mask_first_mate]) + else ## assuming strandMode(x) == 2L + s[mask_first_mate] <- invertStrand(s[mask_first_mate]) + } + } + relist(s, x) + } ) setMethod("qwidth", "GAlignmentsList", @@ -127,6 +161,23 @@ setReplaceMethod("seqnames", "GAlignmentsList", GenomicRanges:::set_CompressedGenomicRangesList_seqnames ) +setReplaceMethod("strandMode", "GAlignmentsList", + function(x, value) + { + x@strandMode <- .normarg_strandMode_replace_value(value) + x + } +) + +setMethod("invertStrand", "GAlignmentsList", + function(x) + { + strand_mode <- strandMode(x) + if (strand_mode != 0L) + strandMode(x) <- 3L - strand_mode + x + } +) ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### Validity. @@ -319,6 +370,47 @@ setMethod("relistToClass", "GAlignments", setMethod("show", "GAlignmentsList", function(object) - GenomicRanges:::show_GenomicRangesList(object) + showGAlignmentsList(object) ) +## adapted from GenomicRanges:::show_GenomicRangesList +## to show the right strand according to strandMode +showGAlignmentsList <- function(x) +{ + lx <- length(x) + nc <- ncol(mcols(x, use.names=FALSE)) + cat(class(x), " object with ", + lx, " ", ifelse(lx == 1L, "pair", "pairs"), + ", strandMode=", strandMode(x), + ", and ", + nc, " metadata ", ifelse(nc == 1L, "column", "columns"), + ":\n", sep="") + x_len <- length(x) + cumsumN <- end(PartitioningByEnd(x)) + N <- tail(cumsumN, 1) + if (x_len == 0L) { + cat("<0 elements>\n") + } + else if (x_len <= 3L || (x_len <= 5L && N <= 20L)) { + y <- x + strand(y) <- strand(y) ## overwrite original strand w/ one + show(as.list(y)) + } + else { + if (cumsumN[[3L]] <= 20L) { + showK <- 3L + } + else if (cumsumN[[2L]] <= 20L) { + showK <- 2L + } + else { + showK <- 1L + } + y <- x[seq_len(showK)] + strand(y) <- strand(y) ## overwrite original strand w/ one + show(as.list(y)) + diffK <- x_len - showK + cat("...\n", "<", diffK, " more element", ifelse(diffK == + 1L, "", "s"), ">\n", sep="") + } +} diff --git a/R/findOverlaps-methods.R b/R/findOverlaps-methods.R index 96dd850..0a379d2 100644 --- a/R/findOverlaps-methods.R +++ b/R/findOverlaps-methods.R @@ -94,7 +94,15 @@ setMethod("findOverlaps", c("GAlignmentsList", "Vector"), select = c("all", "first", "last", "arbitrary"), ignore.strand = FALSE) { - hits <- findOverlaps(grglist(unlist(query, use.names = FALSE)), + ## to take into account the right (real) strand, when ignore.strand=FALSE, + ## we duplicate GAligbnmentsList object in memory and update the original + ## strand with the real one, according to the strandMode parameter + ## TODO: it may be worth investigating alternative ways to avoid duplicating + ## a presumably large GAlignmentsList object + query2 <- query + if (!ignore.strand) + strand(query2) <- strand(query2) ## overwrite original strand w/ real one + hits <- findOverlaps(grglist(unlist(query2, use.names=FALSE)), subject, maxgap = maxgap, minoverlap = minoverlap, type = match.arg(type), select = match.arg(select), ignore.strand = ignore.strand) @@ -112,7 +120,15 @@ setMethod("findOverlaps", c("Vector", "GAlignmentsList"), select = c("all", "first", "last", "arbitrary"), ignore.strand = FALSE) { - hits <- findOverlaps(query, grglist(unlist(subject, use.names = FALSE)), + ## to take into account the right (real) strand, when ignore.strand=FALSE, + ## we duplicate GAligbnmentsList object in memory and update the original + ## strand with the real one, according to the strandMode parameter + ## TODO: it may be worth investigating alternative ways to avoid duplicating + ## a presumably large GAlignmentsList object + subject2 <- subject + if (!ignore.strand) + strand(subject2) <- strand(subject2) ## overwrite original strand w/ real one + hits <- findOverlaps(query, grglist(unlist(subject2, use.names = FALSE)), maxgap = maxgap, minoverlap = minoverlap, type = match.arg(type), select = match.arg(select), ignore.strand = ignore.strand) @@ -130,8 +146,19 @@ setMethod("findOverlaps", c("GAlignmentsList", "GAlignmentsList"), select = c("all", "first", "last", "arbitrary"), ignore.strand = FALSE) { - hits <- findOverlaps(grglist(unlist(query, use.names = FALSE)), - grglist(unlist(subject, use.names = FALSE)), + ## to take into account the right (real) strand, when ignore.strand=FALSE, + ## we duplicate GAligbnmentsList object in memory and update the original + ## strand with the real one, according to the strandMode parameter + ## TODO: it may be worth investigating alternative ways to avoid duplicating + ## a presumably large GAlignmentsList object + query2 <- query + subject2 <- subject + if (!ignore.strand) { + strand(query2) <- strand(query2) ## overwrite original strand w/ real one + strand(subject2) <- strand(subject2) ## overwrite original strand w/ real one + } + hits <- findOverlaps(grglist(unlist(query2, use.names = FALSE)), + grglist(unlist(subject2, use.names = FALSE)), maxgap = maxgap, minoverlap = minoverlap, type = match.arg(type), select = match.arg(select), diff --git a/R/readGAlignments.R b/R/readGAlignments.R index 17001e6..3e74fcd 100644 --- a/R/readGAlignments.R +++ b/R/readGAlignments.R @@ -290,11 +290,12 @@ setMethod("readGAlignmentPairs", "character", setGeneric("readGAlignmentsList", signature="file", function(file, index=file, use.names=FALSE, param=ScanBamParam(), - with.which_label=FALSE) + with.which_label=FALSE, strandMode=1) standardGeneric("readGAlignmentsList") ) -.matesFromBam <- function(file, use.names, param, what0, with.which_label) +.matesFromBam <- function(file, use.names, param, what0, with.which_label, + strandMode) { bamcols <- .load_bamcols_from_BamFile(file, param, what0, with.which_label=with.which_label) @@ -302,7 +303,10 @@ setGeneric("readGAlignmentsList", signature="file", gal <- GAlignments(seqnames=bamcols$rname, pos=bamcols$pos, cigar=bamcols$cigar, strand=bamcols$strand, seqlengths=seqlengths) - gal <- .bindExtraData(gal, use.names=FALSE, param, bamcols, + flag0 <- scanBamFlag() + what0 <- "flag" + param2 <- .normargParam(param, flag0, what0) + gal <- .bindExtraData(gal, use.names=FALSE, param2, bamcols, with.which_label=with.which_label) if (asMates(file)) { f <- factor(bamcols$groupid) @@ -315,35 +319,37 @@ setGeneric("readGAlignmentsList", signature="file", } if (use.names) names(gal) <- unique(splitAsList(bamcols$qname, bamcols$groupid)) + strandMode(gal) <- strandMode gal } .readGAlignmentsList.BamFile <- function(file, index=file, use.names=FALSE, param=ScanBamParam(), - with.which_label=FALSE) + with.which_label=FALSE, strandMode=1) { if (!isTRUEorFALSE(use.names)) stop("'use.names' must be TRUE or FALSE") if (!asMates(file)) bamWhat(param) <- setdiff(bamWhat(param), c("groupid", "mate_status")) - what0 <- c("rname", "strand", "pos", "cigar", "groupid", "mate_status") + what0 <- c("rname", "strand", "pos", "cigar", "groupid", "mate_status", "flag") if (use.names) what0 <- c(what0, "qname") - .matesFromBam(file, use.names, param, what0, with.which_label) + .matesFromBam(file, use.names, param, what0, with.which_label, strandMode) } setMethod("readGAlignmentsList", "BamFile", .readGAlignmentsList.BamFile) setMethod("readGAlignmentsList", "character", function(file, index=file, use.names=FALSE, param=ScanBamParam(), - with.which_label=FALSE) + with.which_label=FALSE, strandMode=1) { bam <- .open_BamFile(file, index=index, asMates=TRUE, param=param) on.exit(close(bam)) readGAlignmentsList(bam, character(0), use.names=use.names, param=param, - with.which_label=with.which_label) + with.which_label=with.which_label, + strandMode=strandMode) } ) diff --git a/inst/unitTests/test_readGAlignmentsList.R b/inst/unitTests/test_readGAlignmentsList.R index deb8f77..72b4a0a 100644 --- a/inst/unitTests/test_readGAlignmentsList.R +++ b/inst/unitTests/test_readGAlignmentsList.R @@ -51,7 +51,7 @@ test_readGAlignmentsList_mcols <- function() bf <- BamFile(chr4, asMates=TRUE, yieldSize=100) param <- ScanBamParam(tag=("NM")) galist <- readGAlignmentsList(bf, param=param) - checkIdentical(colnames(mcols(unlist(galist))), "NM") + checkIdentical(colnames(mcols(unlist(galist))), c("flag", "NM")) checkTrue(names(mcols(galist)) == "mate_status") param <- ScanBamParam(tag=("FO")) @@ -182,3 +182,18 @@ test_readGAlignmentsList_which <- function() rng2 <- as.vector(mcols(unlist(target1[2]))$which_label) checkTrue(all(rng2 %in% my_ROI_labels[4])) } + +text_readGAlignmentsList_findOverlaps <- function() +{ + fl <- system.file("extdata", "ex1.bam", package="Rsamtools") + bf <- BamFile(fl, asMates=TRUE) + galist <- readGAlignmentsList(bf, strandMode=1L) + f <- GRanges(seqnames="seq1", IRanges(30, 250), strand="-") + ov <- findOverlaps(galist[1], f, ignore.strand=FALSE) + checkIdentical(0L, length(ov)) + + galist <- readGAlignmentsList(bf, strandMode=2L) + f <- GRanges(seqnames="seq1", IRanges(30, 250), strand="-") + ov <- findOverlaps(galist[1], f, ignore.strand=FALSE) + checkIdentical(1L, length(ov)) +} diff --git a/man/GAlignmentsList-class.Rd b/man/GAlignmentsList-class.Rd index c378ffd..f1b0374 100644 --- a/man/GAlignmentsList-class.Rd +++ b/man/GAlignmentsList-class.Rd @@ -26,6 +26,9 @@ \alias{elementMetadata<-,GAlignmentsList-method} \alias{seqinfo,GAlignmentsList-method} \alias{seqinfo<-,GAlignmentsList-method} +\alias{strandMode,GAlignmentsList-method} +\alias{strandMode<-,GAlignmentsList-method} +\alias{invertStrand,GAlignmentsList-method} % Coercion: \alias{ranges,GAlignmentsList-method} diff --git a/man/readGAlignments.Rd b/man/readGAlignments.Rd index eb9c21f..9ae58a3 100644 --- a/man/readGAlignments.Rd +++ b/man/readGAlignments.Rd @@ -34,7 +34,8 @@ readGAlignmentPairs(file, index=file, use.names=FALSE, param=NULL, with.which_label=FALSE, strandMode=1) readGAlignmentsList(file, index=file, use.names=FALSE, - param=ScanBamParam(), with.which_label=FALSE) + param=ScanBamParam(), with.which_label=FALSE, + strandMode=1) readGappedReads(file, index=file, use.names=FALSE, param=NULL, with.which_label=FALSE) @@ -103,7 +104,8 @@ readGappedReads(file, index=file, use.names=FALSE, param=NULL, represented as a factor-\link[S4Vectors]{Rle}. } \item{strandMode}{ - Strand mode to set on the returned \link{GAlignmentPairs} object. + Strand mode to set on the returned \link{GAlignmentPairs} or + \link{GAlignmentsList} object. See \code{?\link{strandMode}} for more information. } } From 9dd2f368987dafe132bd5dd3c953f25090b177e5 Mon Sep 17 00:00:00 2001 From: Robert Castelo Date: Fri, 17 Feb 2023 16:27:58 +0100 Subject: [PATCH 2/2] Coercion methods between GAlignmentsList and GAlignmentPairs objects, and an updateObject method --- DESCRIPTION | 2 +- R/GAlignmentsList-class.R | 47 ++++++++++++++++++++++++++++++++++++ man/GAlignmentsList-class.Rd | 1 + 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 628dfe7..8175360 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,7 +10,7 @@ biocViews: Infrastructure, DataImport, Genetics, Sequencing, RNASeq, SNP, URL: https://bioconductor.org/packages/GenomicAlignments Video: https://www.youtube.com/watch?v=2KqBSbkfhRo , https://www.youtube.com/watch?v=3PK_jx44QTs BugReports: https://github.com/Bioconductor/GenomicAlignments/issues -Version: 1.35.1 +Version: 1.35.2 License: Artistic-2.0 Encoding: UTF-8 Authors@R: c( diff --git a/R/GAlignmentsList-class.R b/R/GAlignmentsList-class.R index 9451337..9c5fef2 100644 --- a/R/GAlignmentsList-class.R +++ b/R/GAlignmentsList-class.R @@ -68,6 +68,42 @@ setClass("GAlignmentsList", ### same length and class as 'x' (endomorphism). ### +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### updateObject() +### +### Internal representation of GAlignmentsList objects has changed in +### GenomicAlignments 1.35.1 (Bioc 3.17). +### + +.get_GAlignmentsList_version <- function(object) +{ + if (.hasSlot(object, "strandMode")) "current" else "< 1.35.1" +} + +setMethod("updateObject", "GAlignmentsList", + function(object, ..., verbose=FALSE) + { + ## elementType slot. + version <- .get_GAlignmentsList_version(object) + if (version == "current") { + if (verbose) + message("[updateObject] Internal representation of ", + class(object), " object is current.\n", + "[updateObject] Nothing to update.") + } else { + if (verbose) + message("[updateObject] ", class(object), " object uses ", + "internal representation from\n", + "[updateObject] GenomicAlignments ", version, ". ", + "Updating it ...") + object@strandMode <- new(class(object))@elementType + } + + callNextMethod() + } +) + + ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### Getters. ### @@ -187,6 +223,7 @@ setMethod("invertStrand", "GAlignmentsList", { ## TDB: Currently known pitfalls are caught by ## GAlignments validity. + .valid.GAlignmentPairs.strandMode(x) } setValidity2("GAlignmentsList", .valid.GAlignmentsList) @@ -320,8 +357,13 @@ setAs("GAlignmentPairs", "GAlignmentsList", pbe <- PartitioningByEnd() else pbe <- PartitioningByEnd(seq(2, 2*length(from), 2), names=names(from)) + mcols_gal <- cbind(mate_status=factor(rep("mated", length(from)), + levels=c("mated", "ambiguous", "unmated")), + mcols(from)) new("GAlignmentsList", + strandMode=strandMode(from), unlistData=unlist(from, use.names=FALSE), + elementMetadata=mcols_gal, partitioning=pbe) } ) @@ -334,12 +376,17 @@ setAs("GAlignmentsList", "GAlignmentPairs", first <- c(TRUE, FALSE) last <- c(FALSE, TRUE) ga_mcols <- mcols(ga, use.names=FALSE) + if (!is.null(ga_mcols$flag)) { + first <- bamFlagTest(ga_mcols$flag, "isFirstMateRead") + last <- !first + } isProperPair <- if (!is.null(ga_mcols$flag)) { bamFlagTest(ga_mcols$flag[first], "isProperPair") } else { TRUE } GAlignmentPairs(ga[first], ga[last], + strandMode=strandMode(from), isProperPair=isProperPair, names=names(ga)[first]) } diff --git a/man/GAlignmentsList-class.Rd b/man/GAlignmentsList-class.Rd index f1b0374..bc8c107 100644 --- a/man/GAlignmentsList-class.Rd +++ b/man/GAlignmentsList-class.Rd @@ -5,6 +5,7 @@ \alias{class:GAlignmentsList} \alias{GAlignmentsList-class} \alias{GAlignmentsList} +\alias{updateObject,GAlignmentsList-method} % Constructors: \alias{GAlignmentsList}