From e28cd057d4a7758c63fdb08a13a18df8d560388b Mon Sep 17 00:00:00 2001 From: Terry Casstevens Date: Tue, 7 Jun 2022 14:01:08 -0400 Subject: [PATCH 01/19] Added bufferedWriter utility --- src/main/kotlin/biokotlin/util/FileIO.kt | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/main/kotlin/biokotlin/util/FileIO.kt b/src/main/kotlin/biokotlin/util/FileIO.kt index b4a92097..7eba853d 100644 --- a/src/main/kotlin/biokotlin/util/FileIO.kt +++ b/src/main/kotlin/biokotlin/util/FileIO.kt @@ -2,10 +2,10 @@ package biokotlin.util -import java.io.BufferedReader -import java.io.File +import java.io.* import java.net.URL import java.util.zip.GZIPInputStream +import java.util.zip.GZIPOutputStream fun bufferedReader(filename: String): BufferedReader { return try { @@ -21,6 +21,18 @@ fun bufferedReader(filename: String): BufferedReader { File(filename).bufferedReader() } } catch (e: Exception) { - throw IllegalStateException("FileIO: getBufferedReader: problem getting reader: " + e.message) + throw IllegalStateException("FileIO: bufferedReader: problem getting reader: " + e.message) + } +} + +fun bufferedWriter(filename: String, append: Boolean = false): BufferedWriter { + return try { + if (filename.endsWith(".gz")) { + BufferedWriter(OutputStreamWriter(GZIPOutputStream(FileOutputStream(filename, append)))) + } else { + BufferedWriter(OutputStreamWriter(FileOutputStream(filename, append))) + } + } catch (e: Exception) { + throw IllegalStateException("FileIO: bufferedWriter: problem getting writer: " + e.message) } } \ No newline at end of file From b3b65915fb414a550af2b0ff2dd0115bfe7ad9e6 Mon Sep 17 00:00:00 2001 From: Jeffrey Morse Date: Tue, 7 Jun 2022 14:02:09 -0400 Subject: [PATCH 02/19] Importing Brandon/Zach's code from Jupyter to adapt the writeBedFile function --- .../biokotlin/seqIO/MakeGffForNewSpecies.kt | 329 ++++++++++++++++++ 1 file changed, 329 insertions(+) create mode 100644 src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt diff --git a/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt b/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt new file mode 100644 index 00000000..034f365c --- /dev/null +++ b/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt @@ -0,0 +1,329 @@ +package biokotlin.seqIO + +import biokotlin.util.bufferedWriter +import htsjdk.samtools.CigarOperator +import htsjdk.samtools.SAMRecord +import htsjdk.samtools.SamReaderFactory +import htsjdk.samtools.ValidationStringency +import java.io.File + +/** + * Feature operator classifiers + * * `U` = undefined (`S`, `H`) + * * `I` = intron (`N`) + * * `C` = coding (`M`, `D`, `I`**, `=`, `X`) <- `I` is ignored + */ +enum class FeatureOperator { + U, C, I +} + +/** + * Data class for transcript features + * * `length` = length of feature + * * `operator` = what is the feature type? (*See `FeatureOperator` for more info*) + */ +data class TranscriptFeatures( + val length: Int, + val operator: FeatureOperator, + var id: Int +) + +/** + * Data class for BED columns + * * `chrom` = contig name + * * `chromStart` = start coordinate for sequence considered + * * `chromEnd` = end coordinate for sequence considered + * * `name` = ID for range + * * `strand` = DNA strand (`+`, `-`, or `.`) + */ +data class BedFeatures( + val chrom: String, + val chromStart: Int, + val chromEnd: Int, + val name: String, + val strand: Char +) + +// ZACK FUNCTIONS ---- + +data class TranscriptBpAlignmentStats(val xOrEqArray: MutableList, + val eqArray:MutableList, + var nCounts:Int = 0, + var dCounts:Int = 0, + var numAlignments:Int = 0 ) + +enum class ExonOperator { + L, C, T +} +val exonOperatorMap = mapOf('L' to ExonOperator.L,'C' to ExonOperator.C,'T' to ExonOperator.T) +data class ExonBoundary(val length: Int, val operator: ExonOperator) + +/** + * Function to take a single exon and parse out the operator and the length of that operation. + * It needs to have a running sub list of the number characters until it sees an operator. + * Then it will convert the temp String into an Int and save out the boundary. + */ +fun convertExonStringToBoundary(exonString: String) : List { + var runningCount = StringBuilder() + val exonBoundaries = mutableListOf() + for(element in exonString) { + //Check to see if we hit an operator + if(element in exonOperatorMap.keys) { + //if so, turn the runningCount into an Int and add the boundary to the list. + val countString = runningCount.toString() + check(!countString.isEmpty()) {"Error parsing exon name. No counts for operator"} + + exonBoundaries.add(ExonBoundary(countString.toInt(), exonOperatorMap[element]!!)) + runningCount.clear() + } + else { + runningCount.append("$element") + } + } + + return exonBoundaries +} + +/** + * Function to parse out the exon boundaries from the transcript name. + * They take the form: transcript_1234:100L24C-12C-90C10T + */ +fun parseExonBoundaries(transcriptName: String) : List> { + val exonStrings = transcriptName.split(":")[1].split("-") + + return exonStrings.map { convertExonStringToBoundary(it) } +} + +/** + * Compute CDS positions (Pair) + */ +fun computeCDSPositions(exonBoundaries:List>) : Pair { + val sizesOfOperators = exonBoundaries.flatten() + .groupBy { it.operator } + .map { Pair(it.key, it.value.map { currValue -> currValue.length }.sum()) } + .toMap() + + val leaderSize = sizesOfOperators[ExonOperator.L]?:0 + + val cdsSize = sizesOfOperators[ExonOperator.C]?:0 + + return Pair(leaderSize, leaderSize + cdsSize - 1) +} + +/** + * Function that actually creates the xOrEQ array and the eqArray. It also counts the number of N and D bps + */ +fun buildTranscriptBpAlignmentStats(samRecord: SAMRecord) : TranscriptBpAlignmentStats { + val xOrEqArray = Array(samRecord.readLength) { 0 } + val eqArray = Array(samRecord.readLength) { 0 } + + var nCounts = 0 + var dCounts = 0 + + var currentBp = 0 + val cigarElements = samRecord.cigar.cigarElements + + //Loop through the cigarElements + for(element in cigarElements) { + val operator = element.operator + val count = element.length + + if(operator== CigarOperator.N) { + nCounts+=count + } + else if(operator==CigarOperator.D) { + dCounts+=count + } + //Check to see if consumes query + else if(operator.consumesReadBases()) { + //If it consumes read bases, we can walk through the length of the CIGAR operator and set the position as a 1 + for(index in 0 until count) { + if(operator.isAlignment) { + xOrEqArray[currentBp] = 1 + } + + if (operator==CigarOperator.EQ) { + eqArray[currentBp] = 1 + } + + currentBp++ + } + } + + } + + //Check to see if it was reversed during alignment. If so we need to flip our arrays. + return if(samRecord.readNegativeStrandFlag) { + TranscriptBpAlignmentStats(xOrEqArray.reversed().toMutableList(), eqArray.reversed().toMutableList(), nCounts, dCounts,1) + } + else { + TranscriptBpAlignmentStats(xOrEqArray.toMutableList(), eqArray.toMutableList(),nCounts, dCounts,1) + } + +} + +/** + * Generate a list of feature lengths from a CIGAR string + * @param featureLengths A list of `TranscriptFeature` data objects + * @param samRecord A `SAMRecord` object + * @param taxaId What reference assembly was this aligned to? + * @return A list of `BedFeatures` data objects. + */ +fun calculateRanges(featureLengths: List, samRecord: SAMRecord, taxaId: String): MutableList { + var aggLength = samRecord.alignmentStart + val contig = samRecord.referenceName + val bedFeatures = mutableListOf() + val zmTranscriptRanges = parseExonBoundaries(samRecord.readName).flatten() + val strand = when { + samRecord.readNegativeStrandFlag -> '-' + else -> '+' + } + val prefixReadName = samRecord.readName.substringBefore(":") + + for (feature in featureLengths) { + if (feature.operator == FeatureOperator.C || feature.operator == FeatureOperator.I) { + bedFeatures.add( + BedFeatures( + contig, + (aggLength) - 1, + (aggLength + feature.length - 1), + "${taxaId}_${prefixReadName}_${feature.operator}.${feature.id}", + strand + ) + ) + aggLength += feature.length + } else { + continue + } + } + return bedFeatures +} + +/** + * Generate a list of feature lengths from a CIGAR string + * @param samRecord A SAM record entry + * @param taxaId What reference assembly was this aligned to? + * @return A list of `BedFeatures` data objects. + */ +fun buildFeatureRanges(samRecord: SAMRecord, taxaId: String): MutableList { + val cigarElements = samRecord.cigar.cigarElements + val transcriptFeatures = mutableListOf() + + // Loop through CIGAR elements + var sumLength = 0 + var i = 0 + var iCount = 1 + var cCount = 1 + var uCount = 1 + for (element in cigarElements) { + val operator = element.operator + + if (operator != CigarOperator.I) { + sumLength += element.length + } + // FIX - change to last iterator [prev: element == cigarElements.last()] + if (i == cigarElements.lastIndex && (operator.isAlignment || operator == CigarOperator.D)) { + transcriptFeatures.add(TranscriptFeatures(sumLength, FeatureOperator.C, cCount)) + break + } + // TODO: 2/6/2022 - how to handle I/D elements? + when (operator) { + CigarOperator.S, CigarOperator.H -> { + transcriptFeatures.add(TranscriptFeatures(element.length, FeatureOperator.U, uCount)) + sumLength = 0 + uCount++ + } + CigarOperator.N -> { + transcriptFeatures.add(TranscriptFeatures(element.length, FeatureOperator.I, iCount)) + sumLength = 0 + iCount++ + } + else -> { + if (cigarElements[i + 1].operator == CigarOperator.N || cigarElements[i + 1].operator.isClipping) { + transcriptFeatures.add(TranscriptFeatures(sumLength, FeatureOperator.C, cCount)) + cCount++ + } + } + + } + + i++ + } + + // Check to see if it was reversed during alignment. If so, we need to reverse the ID order (e.g. 1-18 -> 18-1). + val featureLengths = when { + samRecord.readNegativeStrandFlag -> { + val idList = mutableListOf() + transcriptFeatures.forEach { idList.add(it.id) } + val revIdList = idList.reversed() + var j = 0 + transcriptFeatures.forEach { + it.id = revIdList[j] + j++ + } + transcriptFeatures.toMutableList() + } + else -> { + transcriptFeatures + } + } + + return calculateRanges(featureLengths, samRecord, taxaId) +} + +/** + * Writes a BED file based on a SAM file. Generates coordinates for + * introns (I), coding regions (C), and whole transcripts. IDs (e.g. C.1) + * are based on strand (+, -). + * @param samFile A String object referring to a SAM file + * @param outputFile A String object referring to the output BED file + * @param taxaId What reference assembly was this aligned to? + * @param minAlignmentPercent Percent match threshold to alignment region. + */ +fun writeBedFile(samFile: String, outputFile: String, taxaId: String, minAlignmentPercent: Double = 0.90) { + // Read in SAM and creater iterator object ---- + println("Processing $taxaId ...") + val timeStartRead = System.currentTimeMillis() + val samIterator = SamReaderFactory.makeDefault() + .validationStringency(ValidationStringency.SILENT) + .setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, false) + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, false) + .setOption(SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS, false) + .open(File(samFile)) + .iterator() + val timeEndRead = System.currentTimeMillis() - timeStartRead + println("Elapsed read time: $timeEndRead ms") + + // Make buffered reader and write to file ---- + val timeStart = System.currentTimeMillis() + val bw = bufferedWriter(outputFile) + while(samIterator.hasNext()) { + val currentRecord = samIterator.next() + + // Check for secondary, supplementary, or unmapped reads ---- + if (!currentRecord.isSecondaryOrSupplementary && !currentRecord.readUnmappedFlag) { + + // Check for minimum alignment % ---- + val exonBoundaries = parseExonBoundaries(currentRecord.readName) + val cdsBoundaries = computeCDSPositions(exonBoundaries) + val stats = buildTranscriptBpAlignmentStats(currentRecord) + val numMapping = stats.xOrEqArray.slice(cdsBoundaries.first .. cdsBoundaries.second).sum() + val alignmentPercentage = (numMapping.toDouble() / (cdsBoundaries.second - cdsBoundaries.first + 1)) + if (alignmentPercentage >= minAlignmentPercent) { + val bedStats = buildFeatureRanges(currentRecord, taxaId) // make BED ranges + val transId = "${currentRecord.referenceName}\t${currentRecord.alignmentStart - 1}\t${currentRecord.alignmentEnd}\t${taxaId}_${currentRecord.readName.substringBefore(":")}\t0\t${bedStats[0].strand}\n" + bw.write(transId) + bedStats.forEach { + bw.write("${it.chrom}\t${it.chromStart}\t${it.chromEnd}\t${it.name}\t0\t${it.strand}\n") + } + } + + } + } + bw.close() + + // Get time stats ---- + val timeEnd = System.currentTimeMillis() - timeStart + println("Elapsed BED creation time: $timeEnd ms") +} \ No newline at end of file From d7adca76911f2f3bc1435cee5e8d0c940ff7d965 Mon Sep 17 00:00:00 2001 From: Jeffrey Morse Date: Tue, 7 Jun 2022 15:27:48 -0400 Subject: [PATCH 03/19] Initial pass of adapting it to make GFF --- .../biokotlin/seqIO/MakeGffForNewSpecies.kt | 108 +++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt b/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt index 034f365c..db3cadb4 100644 --- a/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt +++ b/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt @@ -42,7 +42,44 @@ data class BedFeatures( val chromEnd: Int, val name: String, val strand: Char -) + ) + +/** + * Data class for GFF columns (descriptions from https://useast.ensembl.org/info/website/upload/gff3.html) + * * seqid - name of the chromosome or scaffold + * * source - name of the program that generated this feature, or the data source (database/project name) + * * type - type of feature //TODO check which types our features can be (presumably only a subset of all possible types) + * * start - start position (STARTS COUNTING AT 1) + * * end - end position (STARTS COUNTING AT 1, INCLUSIVE) + * * score - floating point value + * * strand - '+', '-', or '.' + * * phase - One of '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon, + * '1' that the second base is the first base of a codon, and so on.. + * * attributes - a map of tag-value pairs. Attribute tags with special meanings found at http://gmod.org/wiki/GFF3#GFF3_Format + * * key attributes: Parent, ID, Name (can be same as ID) + */ +data class GffFeatures( + val seqid: String, + val source: String, + val type: String, + val start: Int, + val end: Int, + val score: Double, + val strand: Char, + val phase: Int, + val attributes: Map, +) { + /** + * Converts this GffFeature into a GFF column + */ + fun asColumn(): String { + val sb = StringBuilder() + for ((tag, value) in attributes) { + sb.append("$tag=$value;") + } + return "$seqid\t$source\t$type\t$start\t$end\t$score\t$strand\t$phase\t$sb\n" + } +} // ZACK FUNCTIONS ---- @@ -323,6 +360,75 @@ fun writeBedFile(samFile: String, outputFile: String, taxaId: String, minAlignme } bw.close() + // Get time stats ---- + val timeEnd = System.currentTimeMillis() - timeStart + println("Elapsed BED creation time: $timeEnd ms") +} + +/** + * Writes a GFF file based on the SAM file. + * @param samFile A String object referring to a SAM file + * @param outputFile A String object referring to the output BED file + * @param taxaId What reference assembly was this aligned to? + * @param minQuality Percent match threshold to alignment region. + * @param maxNumber //TODO how to define this? + */ +fun writeGffFile(samFile: String, outputFile: String, taxaId: String, minQuality: Double = 0.90, maxNumber: Int = 1) { + // Read in SAM and creater iterator object ---- + println("Processing $taxaId ...") + val timeStartRead = System.currentTimeMillis() + val samIterator = SamReaderFactory.makeDefault() + .validationStringency(ValidationStringency.SILENT) + .setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, false) + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, false) + .setOption(SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS, false) + .open(File(samFile)) + .iterator() + val timeEndRead = System.currentTimeMillis() - timeStartRead + println("Elapsed read time: $timeEndRead ms") + + // Make buffered reader and write to file ---- + val timeStart = System.currentTimeMillis() + val bw = bufferedWriter(outputFile) + while(samIterator.hasNext()) { + val currentRecord = samIterator.next() + + // Check for secondary, supplementary, or unmapped reads ---- + if (!currentRecord.isSecondaryOrSupplementary && !currentRecord.readUnmappedFlag) { + + // Check for minimum alignment % ---- + val exonBoundaries = parseExonBoundaries(currentRecord.readName) + val cdsBoundaries = computeCDSPositions(exonBoundaries) + val stats = buildTranscriptBpAlignmentStats(currentRecord) + val numMapping = stats.xOrEqArray.slice(cdsBoundaries.first .. cdsBoundaries.second).sum() + val alignmentPercentage = (numMapping.toDouble() / (cdsBoundaries.second - cdsBoundaries.first + 1)) + if (alignmentPercentage >= minQuality) { + val bedStats = buildFeatureRanges(currentRecord, taxaId) // make BED ranges //TODO make these GFF + + //TODO does this parent represent mRNA? + val mRNA = GffFeatures( + currentRecord.referenceName, + "SOURCE", + "mRNA", + currentRecord.alignmentStart, + currentRecord.alignmentEnd, + 0.0, + bedStats[0].strand, + + ) + val transId = "${currentRecord.referenceName}\t${currentRecord.alignmentStart - 1}\t${currentRecord.alignmentEnd}\t${taxaId}_${currentRecord.readName.substringBefore(":")}\t0\t${bedStats[0].strand}\n" + bw.write(transId) + //TODO key line to change + bedStats.forEach { + bw.write("${it.chrom}\t${it.chromStart}\t${it.chromEnd}\t${it.name}\t0\t${it.strand}\n") + } + } + + } + } + bw.close() + // Get time stats ---- val timeEnd = System.currentTimeMillis() - timeStart println("Elapsed BED creation time: $timeEnd ms") From 7985c56a13f0318df95af734bbd9a68d7dff030b Mon Sep 17 00:00:00 2001 From: Terry Casstevens Date: Wed, 8 Jun 2022 11:24:18 -0400 Subject: [PATCH 04/19] Moved MakeGffForNewSpecies to gff package --- .../kotlin/biokotlin/{seqIO => gff}/MakeGffForNewSpecies.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename src/main/kotlin/biokotlin/{seqIO => gff}/MakeGffForNewSpecies.kt (99%) diff --git a/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt b/src/main/kotlin/biokotlin/gff/MakeGffForNewSpecies.kt similarity index 99% rename from src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt rename to src/main/kotlin/biokotlin/gff/MakeGffForNewSpecies.kt index db3cadb4..d56466b0 100644 --- a/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt +++ b/src/main/kotlin/biokotlin/gff/MakeGffForNewSpecies.kt @@ -1,4 +1,4 @@ -package biokotlin.seqIO +package biokotlin.gff import biokotlin.util.bufferedWriter import htsjdk.samtools.CigarOperator From e9bbd0b8d85e525afbff1323ed6c1c03c759ac7a Mon Sep 17 00:00:00 2001 From: Jeffrey Morse Date: Wed, 8 Jun 2022 11:25:51 -0400 Subject: [PATCH 05/19] Comitting day 2 of hackathon work --- .../biokotlin/seqIO/MakeGffForNewSpecies.kt | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt b/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt index db3cadb4..15938ae9 100644 --- a/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt +++ b/src/main/kotlin/biokotlin/seqIO/MakeGffForNewSpecies.kt @@ -53,7 +53,7 @@ data class BedFeatures( * * end - end position (STARTS COUNTING AT 1, INCLUSIVE) * * score - floating point value * * strand - '+', '-', or '.' - * * phase - One of '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon, + * * phase - One of '.', '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon, * '1' that the second base is the first base of a codon, and so on.. * * attributes - a map of tag-value pairs. Attribute tags with special meanings found at http://gmod.org/wiki/GFF3#GFF3_Format * * key attributes: Parent, ID, Name (can be same as ID) @@ -64,15 +64,15 @@ data class GffFeatures( val type: String, val start: Int, val end: Int, - val score: Double, + val score: String, val strand: Char, - val phase: Int, + val phase: Char, val attributes: Map, ) { /** - * Converts this GffFeature into a GFF column + * Converts this GffFeature into a GFF row */ - fun asColumn(): String { + fun asRow(): String { val sb = StringBuilder() for ((tag, value) in attributes) { sb.append("$tag=$value;") @@ -406,19 +406,21 @@ fun writeGffFile(samFile: String, outputFile: String, taxaId: String, minQuality if (alignmentPercentage >= minQuality) { val bedStats = buildFeatureRanges(currentRecord, taxaId) // make BED ranges //TODO make these GFF + val topID = "${taxaId}_${currentRecord.readName.substringBefore(":")}"; //TODO does this parent represent mRNA? + //TODO fill in the values val mRNA = GffFeatures( currentRecord.referenceName, "SOURCE", "mRNA", currentRecord.alignmentStart, currentRecord.alignmentEnd, - 0.0, + ".", bedStats[0].strand, - + '.', + mapOf("ID" to topID, "Name" to topID) ) - val transId = "${currentRecord.referenceName}\t${currentRecord.alignmentStart - 1}\t${currentRecord.alignmentEnd}\t${taxaId}_${currentRecord.readName.substringBefore(":")}\t0\t${bedStats[0].strand}\n" - bw.write(transId) + bw.write(mRNA.asRow()) //TODO key line to change bedStats.forEach { bw.write("${it.chrom}\t${it.chromStart}\t${it.chromEnd}\t${it.name}\t0\t${it.strand}\n") From ee35e444808eaf66394adfb952d7bfac812203a0 Mon Sep 17 00:00:00 2001 From: Terry Casstevens Date: Thu, 9 Jun 2022 11:46:16 -0400 Subject: [PATCH 06/19] Adding initial GFF Feature data structures --- src/main/kotlin/biokotlin/gff/Coding.kt | 20 +++++++ src/main/kotlin/biokotlin/gff/Exon.kt | 29 +++++++++ src/main/kotlin/biokotlin/gff/Feature.kt | 65 +++++++++++++++++++++ src/main/kotlin/biokotlin/gff/Gene.kt | 22 +++++++ src/main/kotlin/biokotlin/gff/Intron.kt | 11 ++++ src/main/kotlin/biokotlin/gff/Leader.kt | 20 +++++++ src/main/kotlin/biokotlin/gff/MRNA.kt | 48 +++++++++++++++ src/main/kotlin/biokotlin/gff/Terminator.kt | 20 +++++++ 8 files changed, 235 insertions(+) create mode 100644 src/main/kotlin/biokotlin/gff/Coding.kt create mode 100644 src/main/kotlin/biokotlin/gff/Exon.kt create mode 100644 src/main/kotlin/biokotlin/gff/Feature.kt create mode 100644 src/main/kotlin/biokotlin/gff/Gene.kt create mode 100644 src/main/kotlin/biokotlin/gff/Intron.kt create mode 100644 src/main/kotlin/biokotlin/gff/Leader.kt create mode 100644 src/main/kotlin/biokotlin/gff/MRNA.kt create mode 100644 src/main/kotlin/biokotlin/gff/Terminator.kt diff --git a/src/main/kotlin/biokotlin/gff/Coding.kt b/src/main/kotlin/biokotlin/gff/Coding.kt new file mode 100644 index 00000000..7b431ade --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Coding.kt @@ -0,0 +1,20 @@ +package biokotlin.gff + +/** + * Also known as CDS + */ +class Coding( + seqid: String, + start: Int, + end: Int, + source: String? = null, + score: Double = 0.0, + strand: String = "+", + phase: String = ".", + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, start, end, source, score, strand, phase, attributes, children) { + + override fun type(): FeatureType = FeatureType.Coding + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Exon.kt b/src/main/kotlin/biokotlin/gff/Exon.kt new file mode 100644 index 00000000..49f15d8a --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Exon.kt @@ -0,0 +1,29 @@ +package biokotlin.gff + +class Exon( + seqid: String, + start: Int, + end: Int, + source: String? = null, + score: Double = 0.0, + strand: String = "+", + phase: String = ".", + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, start, end, source, score, strand, phase, attributes, children) { + + override fun type(): FeatureType = FeatureType.Exon + + fun coding(): List { + return children.filterIsInstance().toList() + } + + fun leaders(): List { + return children.filterIsInstance().toList() + } + + fun terminators(): List { + return children.filterIsInstance().toList() + } + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Feature.kt b/src/main/kotlin/biokotlin/gff/Feature.kt new file mode 100644 index 00000000..490e6c74 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Feature.kt @@ -0,0 +1,65 @@ +package biokotlin.gff + +enum class FeatureType { Gene, Exon, Leader, Terminator, Coding, mRNA, Intron } + +/** + * @param seqid The ID of the landmark used to establish the coordinate system for the current feature. IDs may contain + * any characters, but must escape any characters not in the set [a-zA-Z0-9.:^*$@!+_?-|]. In particular, IDs may not + * contain unescaped whitespace and must not begin with an unescaped ">". + * @param start The start and end coordinates of the feature are given in positive 1-based integer coordinates, relative + * to the landmark given in column one. Start is always less than or equal to end. For features that cross the origin + * of a circular feature (e.g. most bacterial genomes, plasmids, and some viral genomes), the requirement for start to + * be less than or equal to end is satisfied by making end = the position of the end + the length of the landmark feature. + * @param end The start and end coordinates of the feature are given in positive 1-based integer coordinates, relative + * to the landmark given in column one. Start is always less than or equal to end. For features that cross the origin + * of a circular feature (e.g. most bacterial genomes, plasmids, and some viral genomes), the requirement for start to + * be less than or equal to end is satisfied by making end = the position of the end + the length of the landmark feature. + * @param source The source is a free text qualifier intended to describe the algorithm or operating procedure that + * generated this feature. Typically this is the name of a piece of software, such as "Genescan" or a database name, + * such as "Genbank." In effect, the source is used to extend the feature ontology by adding a qualifier to the type + * creating a new composite type that is a subclass of the type in the type column. + * @param score The score of the feature, a floating point number. As in earlier versions of the format, the semantics + * of the score are ill-defined. It is strongly recommended that E-values be used for sequence similarity features, + * and that P-values be used for ab initio gene prediction features. + * @param strand The strand of the feature. + for positive strand (relative to the landmark), - for minus strand, + * and . for features that are not stranded. In addition, ? can be used for features whose strandedness is relevant, + * but unknown. + * @param phase For features of type "CDS", the phase indicates where the next codon begins relative to the 5' end + * (where the 5' end of the CDS is relative to the strand of the CDS feature) of the current CDS feature. For + * clarification the 5' end for CDS features on the plus strand is the feature's start and and the 5' end for CDS + * features on the minus strand is the feature's end. The phase is one of the integers 0, 1, or 2, indicating the + * number of bases forward from the start of the current CDS feature the next codon begins. A phase of "0" indicates + * that a codon begins on the first nucleotide of the CDS feature (i.e. 0 bases forward), a phase of "1" indicates + * that the codon begins at the second nucleotide of this CDS feature and a phase of "2" indicates that the codon + * begins at the third nucleotide of this region. Note that ‘Phase’ in the context of a GFF3 CDS feature should not + * be confused with the similar concept of frame that is also a common concept in bioinformatics. Frame is generally + * calculated as a value for a given base relative to the start of the complete open reading frame (ORF) or the + * codon (e.g. modulo 3) while CDS phase describes the start of the next codon relative to a given CDS feature. + * @param attributes A list of feature attributes in the format tag=value. Multiple tag=value pairs are separated + * by semicolons. URL escaping rules are used for tags or values containing the following characters: ",=;". Spaces + * are allowed in this field, but tabs must be replaced with the %09 URL escape. Attribute values do not need to be + * and should not be quoted. The quotes should be included as part of the value by parsers and not stripped. + * @param children + */ +abstract class Feature( + val seqid: String, + val start: Int, + val end: Int, + val source: String? = null, + val score: Double = 0.0, + val strand: String = "+", + val phase: String = ".", + var attributes: Map = emptyMap(), + var children: List = emptyList() +) { + + init { + attributes = attributes.toMap() + children = children.toList().sortedBy { it.start } + } + + abstract fun type(): FeatureType + + fun attribute(key: String) = attributes[key] + +} diff --git a/src/main/kotlin/biokotlin/gff/Gene.kt b/src/main/kotlin/biokotlin/gff/Gene.kt new file mode 100644 index 00000000..098dc083 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Gene.kt @@ -0,0 +1,22 @@ +package biokotlin.gff + + +class Gene( + seqid: String, + start: Int, + end: Int, + source: String? = null, + score: Double = 0.0, + strand: String = "+", + phase: String = ".", + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, start, end, source, score, strand, phase, attributes, children) { + + override fun type(): FeatureType = FeatureType.Gene + + fun mRNAs(): List { + return children.filterIsInstance().toList() + } + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Intron.kt b/src/main/kotlin/biokotlin/gff/Intron.kt new file mode 100644 index 00000000..6595ec15 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Intron.kt @@ -0,0 +1,11 @@ +package biokotlin.gff + +class Intron( + seqid: String, + start: Int, + end: Int +) : Feature(seqid, start, end) { + + override fun type(): FeatureType = FeatureType.Intron + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Leader.kt b/src/main/kotlin/biokotlin/gff/Leader.kt new file mode 100644 index 00000000..7f8bfd7f --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Leader.kt @@ -0,0 +1,20 @@ +package biokotlin.gff + +/** + * Also known as 5' UTR + */ +class Leader( + seqid: String, + start: Int, + end: Int, + source: String? = null, + score: Double = 0.0, + strand: String = "+", + phase: String = ".", + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, start, end, source, score, strand, phase, attributes, children) { + + override fun type(): FeatureType = FeatureType.Leader + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/MRNA.kt b/src/main/kotlin/biokotlin/gff/MRNA.kt new file mode 100644 index 00000000..dbe30140 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/MRNA.kt @@ -0,0 +1,48 @@ +package biokotlin.gff + +/** + * Also known as a Transcript + */ +class MRNA( + seqid: String, + start: Int, + end: Int, + source: String? = null, + score: Double = 0.0, + strand: String = "+", + phase: String = ".", + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, start, end, source, score, strand, phase, attributes, children) { + + override fun type(): FeatureType = FeatureType.mRNA + + fun coding(): List { + return children.filterIsInstance().toList() + } + + fun exons(): List { + return children.filterIsInstance().toList() + } + + fun leaders(): List { + return children.filterIsInstance().toList() + } + + fun terminators(): List { + return children.filterIsInstance().toList() + } + + fun introns(): List { + return children + .chunked(2) + .filter { it.size == 2 } + .map { + val seqid = it[0].seqid + val start = it[0].end + val end = it[1].start + Intron(seqid, start, end) + } + } + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Terminator.kt b/src/main/kotlin/biokotlin/gff/Terminator.kt new file mode 100644 index 00000000..e17d66bc --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Terminator.kt @@ -0,0 +1,20 @@ +package biokotlin.gff + +/** + * Also known as 3' UTR + */ +class Terminator( + seqid: String, + start: Int, + end: Int, + source: String? = null, + score: Double = 0.0, + strand: String = "+", + phase: String = ".", + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, start, end, source, score, strand, phase, attributes, children) { + + override fun type(): FeatureType = FeatureType.Terminator + +} \ No newline at end of file From c11c45fc2b9fd238fe534ca0cf9c77246ead6cc8 Mon Sep 17 00:00:00 2001 From: Terry Casstevens Date: Thu, 9 Jun 2022 14:56:45 -0400 Subject: [PATCH 07/19] Adding initial GFF Feature data structures --- src/main/kotlin/biokotlin/gff/Chromosome.kt | 14 ++++++++++++++ src/main/kotlin/biokotlin/gff/Coding.kt | 4 ++-- src/main/kotlin/biokotlin/gff/Exon.kt | 4 ++-- src/main/kotlin/biokotlin/gff/Feature.kt | 12 ++++++------ src/main/kotlin/biokotlin/gff/Gene.kt | 4 ++-- src/main/kotlin/biokotlin/gff/Intron.kt | 3 ++- src/main/kotlin/biokotlin/gff/Leader.kt | 4 ++-- src/main/kotlin/biokotlin/gff/MRNA.kt | 7 ++++--- src/main/kotlin/biokotlin/gff/Terminator.kt | 4 ++-- 9 files changed, 36 insertions(+), 20 deletions(-) create mode 100644 src/main/kotlin/biokotlin/gff/Chromosome.kt diff --git a/src/main/kotlin/biokotlin/gff/Chromosome.kt b/src/main/kotlin/biokotlin/gff/Chromosome.kt new file mode 100644 index 00000000..18af4181 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Chromosome.kt @@ -0,0 +1,14 @@ +package biokotlin.gff + +class Chromosome ( + seqid: String, + source: String, + start: Int, + end: Int, + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, source, start, end, attributes = attributes, children = children) { + + override fun type(): FeatureType = FeatureType.Chromosome + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Coding.kt b/src/main/kotlin/biokotlin/gff/Coding.kt index 7b431ade..6298253e 100644 --- a/src/main/kotlin/biokotlin/gff/Coding.kt +++ b/src/main/kotlin/biokotlin/gff/Coding.kt @@ -5,15 +5,15 @@ package biokotlin.gff */ class Coding( seqid: String, + source: String, start: Int, end: Int, - source: String? = null, score: Double = 0.0, strand: String = "+", phase: String = ".", attributes: Map = emptyMap(), children: List = emptyList() -) : Feature(seqid, start, end, source, score, strand, phase, attributes, children) { +) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { override fun type(): FeatureType = FeatureType.Coding diff --git a/src/main/kotlin/biokotlin/gff/Exon.kt b/src/main/kotlin/biokotlin/gff/Exon.kt index 49f15d8a..274735d9 100644 --- a/src/main/kotlin/biokotlin/gff/Exon.kt +++ b/src/main/kotlin/biokotlin/gff/Exon.kt @@ -2,15 +2,15 @@ package biokotlin.gff class Exon( seqid: String, + source: String, start: Int, end: Int, - source: String? = null, score: Double = 0.0, strand: String = "+", phase: String = ".", attributes: Map = emptyMap(), children: List = emptyList() -) : Feature(seqid, start, end, source, score, strand, phase, attributes, children) { +) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { override fun type(): FeatureType = FeatureType.Exon diff --git a/src/main/kotlin/biokotlin/gff/Feature.kt b/src/main/kotlin/biokotlin/gff/Feature.kt index 490e6c74..6f57894c 100644 --- a/src/main/kotlin/biokotlin/gff/Feature.kt +++ b/src/main/kotlin/biokotlin/gff/Feature.kt @@ -1,11 +1,15 @@ package biokotlin.gff -enum class FeatureType { Gene, Exon, Leader, Terminator, Coding, mRNA, Intron } +enum class FeatureType { Gene, Exon, Leader, Terminator, Coding, mRNA, Intron, Chromosome } /** * @param seqid The ID of the landmark used to establish the coordinate system for the current feature. IDs may contain * any characters, but must escape any characters not in the set [a-zA-Z0-9.:^*$@!+_?-|]. In particular, IDs may not * contain unescaped whitespace and must not begin with an unescaped ">". + * @param source The source is a free text qualifier intended to describe the algorithm or operating procedure that + * generated this feature. Typically this is the name of a piece of software, such as "Genescan" or a database name, + * such as "Genbank." In effect, the source is used to extend the feature ontology by adding a qualifier to the type + * creating a new composite type that is a subclass of the type in the type column. * @param start The start and end coordinates of the feature are given in positive 1-based integer coordinates, relative * to the landmark given in column one. Start is always less than or equal to end. For features that cross the origin * of a circular feature (e.g. most bacterial genomes, plasmids, and some viral genomes), the requirement for start to @@ -14,10 +18,6 @@ enum class FeatureType { Gene, Exon, Leader, Terminator, Coding, mRNA, Intron } * to the landmark given in column one. Start is always less than or equal to end. For features that cross the origin * of a circular feature (e.g. most bacterial genomes, plasmids, and some viral genomes), the requirement for start to * be less than or equal to end is satisfied by making end = the position of the end + the length of the landmark feature. - * @param source The source is a free text qualifier intended to describe the algorithm or operating procedure that - * generated this feature. Typically this is the name of a piece of software, such as "Genescan" or a database name, - * such as "Genbank." In effect, the source is used to extend the feature ontology by adding a qualifier to the type - * creating a new composite type that is a subclass of the type in the type column. * @param score The score of the feature, a floating point number. As in earlier versions of the format, the semantics * of the score are ill-defined. It is strongly recommended that E-values be used for sequence similarity features, * and that P-values be used for ab initio gene prediction features. @@ -43,9 +43,9 @@ enum class FeatureType { Gene, Exon, Leader, Terminator, Coding, mRNA, Intron } */ abstract class Feature( val seqid: String, + val source: String, val start: Int, val end: Int, - val source: String? = null, val score: Double = 0.0, val strand: String = "+", val phase: String = ".", diff --git a/src/main/kotlin/biokotlin/gff/Gene.kt b/src/main/kotlin/biokotlin/gff/Gene.kt index 098dc083..f96c980b 100644 --- a/src/main/kotlin/biokotlin/gff/Gene.kt +++ b/src/main/kotlin/biokotlin/gff/Gene.kt @@ -3,15 +3,15 @@ package biokotlin.gff class Gene( seqid: String, + source: String, start: Int, end: Int, - source: String? = null, score: Double = 0.0, strand: String = "+", phase: String = ".", attributes: Map = emptyMap(), children: List = emptyList() -) : Feature(seqid, start, end, source, score, strand, phase, attributes, children) { +) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { override fun type(): FeatureType = FeatureType.Gene diff --git a/src/main/kotlin/biokotlin/gff/Intron.kt b/src/main/kotlin/biokotlin/gff/Intron.kt index 6595ec15..d11a429a 100644 --- a/src/main/kotlin/biokotlin/gff/Intron.kt +++ b/src/main/kotlin/biokotlin/gff/Intron.kt @@ -2,9 +2,10 @@ package biokotlin.gff class Intron( seqid: String, + source: String, start: Int, end: Int -) : Feature(seqid, start, end) { +) : Feature(seqid, source, start, end) { override fun type(): FeatureType = FeatureType.Intron diff --git a/src/main/kotlin/biokotlin/gff/Leader.kt b/src/main/kotlin/biokotlin/gff/Leader.kt index 7f8bfd7f..a6e32f17 100644 --- a/src/main/kotlin/biokotlin/gff/Leader.kt +++ b/src/main/kotlin/biokotlin/gff/Leader.kt @@ -5,15 +5,15 @@ package biokotlin.gff */ class Leader( seqid: String, + source: String, start: Int, end: Int, - source: String? = null, score: Double = 0.0, strand: String = "+", phase: String = ".", attributes: Map = emptyMap(), children: List = emptyList() -) : Feature(seqid, start, end, source, score, strand, phase, attributes, children) { +) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { override fun type(): FeatureType = FeatureType.Leader diff --git a/src/main/kotlin/biokotlin/gff/MRNA.kt b/src/main/kotlin/biokotlin/gff/MRNA.kt index dbe30140..fe31bfe6 100644 --- a/src/main/kotlin/biokotlin/gff/MRNA.kt +++ b/src/main/kotlin/biokotlin/gff/MRNA.kt @@ -5,15 +5,15 @@ package biokotlin.gff */ class MRNA( seqid: String, + source: String, start: Int, end: Int, - source: String? = null, score: Double = 0.0, strand: String = "+", phase: String = ".", attributes: Map = emptyMap(), children: List = emptyList() -) : Feature(seqid, start, end, source, score, strand, phase, attributes, children) { +) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { override fun type(): FeatureType = FeatureType.mRNA @@ -39,9 +39,10 @@ class MRNA( .filter { it.size == 2 } .map { val seqid = it[0].seqid + val source = it[0].source val start = it[0].end val end = it[1].start - Intron(seqid, start, end) + Intron(seqid, source, start, end) } } diff --git a/src/main/kotlin/biokotlin/gff/Terminator.kt b/src/main/kotlin/biokotlin/gff/Terminator.kt index e17d66bc..0b71033f 100644 --- a/src/main/kotlin/biokotlin/gff/Terminator.kt +++ b/src/main/kotlin/biokotlin/gff/Terminator.kt @@ -5,15 +5,15 @@ package biokotlin.gff */ class Terminator( seqid: String, + source: String, start: Int, end: Int, - source: String? = null, score: Double = 0.0, strand: String = "+", phase: String = ".", attributes: Map = emptyMap(), children: List = emptyList() -) : Feature(seqid, start, end, source, score, strand, phase, attributes, children) { +) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { override fun type(): FeatureType = FeatureType.Terminator From 40f4c572f198a0b6a423f8f23b295f1dd1fb3d35 Mon Sep 17 00:00:00 2001 From: Terry Casstevens Date: Thu, 9 Jun 2022 15:37:43 -0400 Subject: [PATCH 08/19] Adding initial GFF Feature data structures --- src/main/kotlin/biokotlin/gff/Feature.kt | 22 +++++++++ .../kotlin/biokotlin/gff/FeatureBuilder.kt | 47 +++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 src/main/kotlin/biokotlin/gff/FeatureBuilder.kt diff --git a/src/main/kotlin/biokotlin/gff/Feature.kt b/src/main/kotlin/biokotlin/gff/Feature.kt index 6f57894c..e73899e4 100644 --- a/src/main/kotlin/biokotlin/gff/Feature.kt +++ b/src/main/kotlin/biokotlin/gff/Feature.kt @@ -62,4 +62,26 @@ abstract class Feature( fun attribute(key: String) = attributes[key] + fun parent() = attributes["Parent"] + + fun id() = attributes["ID"] + + fun name() = attributes["Name"] + + fun alias() = attributes["Alias"] + + fun target() = attributes["Target"] + + fun gap() = attributes["Gap"] + + fun derivesFrom() = attributes["Derives_from"] + + fun note() = attributes["Note"] + + fun dbxref() = attributes["Dbxref"] + + fun ontologyTerm() = attributes["Ontology_term"] + + fun isCircular() = attributes["Is_circular"] + } diff --git a/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt b/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt new file mode 100644 index 00000000..6fc1215d --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt @@ -0,0 +1,47 @@ +package biokotlin.gff + +class FeatureBuilder( + val seqid: String, + val source: String, + val type: FeatureType, + val start: Int, + val end: Int, + val score: Double = 0.0, + val strand: String = "+", + val phase: String = ".", + var attributes: Map = emptyMap() +) { + + init { + id()?.let { builderMap[it] = this } + } + + val children = mutableListOf() + + fun id() = attributes["ID"] + + fun add(child: FeatureBuilder) { + children.add(child) + } + + fun build(): Feature { + + val children = children.map { it.build() } + return when (type) { + FeatureType.Gene -> Gene(seqid, source, start, end, score, strand, phase, attributes, children) + FeatureType.Exon -> Exon(seqid, source, start, end, score, strand, phase, attributes, children) + FeatureType.Leader -> Leader(seqid, source, start, end, score, strand, phase, attributes, children) + FeatureType.Terminator -> Terminator(seqid, source, start, end, score, strand, phase, attributes, children) + FeatureType.Coding -> Coding(seqid, source, start, end, score, strand, phase, attributes, children) + FeatureType.mRNA -> MRNA(seqid, source, start, end, score, strand, phase, attributes, children) + FeatureType.Intron -> Intron(seqid, source, start, end) + FeatureType.Chromosome -> Chromosome(seqid, source, start, end, attributes, children) + } + + } + +} + +private val builderMap = mutableMapOf() + +fun featureBuilder(id: String) = builderMap[id] \ No newline at end of file From 4657027ae6e4f0a733ef0b781090331f7f5a4a87 Mon Sep 17 00:00:00 2001 From: Terry Casstevens Date: Fri, 10 Jun 2022 10:28:58 -0400 Subject: [PATCH 09/19] Adding initial GFF Feature data structures --- src/main/kotlin/biokotlin/gff/Coding.kt | 2 +- src/main/kotlin/biokotlin/gff/Exon.kt | 2 +- src/main/kotlin/biokotlin/gff/Feature.kt | 2 +- src/main/kotlin/biokotlin/gff/Gene.kt | 2 +- src/main/kotlin/biokotlin/gff/Leader.kt | 2 +- src/main/kotlin/biokotlin/gff/MRNA.kt | 2 +- src/main/kotlin/biokotlin/gff/Terminator.kt | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/kotlin/biokotlin/gff/Coding.kt b/src/main/kotlin/biokotlin/gff/Coding.kt index 6298253e..21d692c0 100644 --- a/src/main/kotlin/biokotlin/gff/Coding.kt +++ b/src/main/kotlin/biokotlin/gff/Coding.kt @@ -8,7 +8,7 @@ class Coding( source: String, start: Int, end: Int, - score: Double = 0.0, + score: Double = Double.NaN, strand: String = "+", phase: String = ".", attributes: Map = emptyMap(), diff --git a/src/main/kotlin/biokotlin/gff/Exon.kt b/src/main/kotlin/biokotlin/gff/Exon.kt index 274735d9..65e9d1f4 100644 --- a/src/main/kotlin/biokotlin/gff/Exon.kt +++ b/src/main/kotlin/biokotlin/gff/Exon.kt @@ -5,7 +5,7 @@ class Exon( source: String, start: Int, end: Int, - score: Double = 0.0, + score: Double = Double.NaN, strand: String = "+", phase: String = ".", attributes: Map = emptyMap(), diff --git a/src/main/kotlin/biokotlin/gff/Feature.kt b/src/main/kotlin/biokotlin/gff/Feature.kt index e73899e4..c23a244a 100644 --- a/src/main/kotlin/biokotlin/gff/Feature.kt +++ b/src/main/kotlin/biokotlin/gff/Feature.kt @@ -46,7 +46,7 @@ abstract class Feature( val source: String, val start: Int, val end: Int, - val score: Double = 0.0, + val score: Double = Double.NaN, val strand: String = "+", val phase: String = ".", var attributes: Map = emptyMap(), diff --git a/src/main/kotlin/biokotlin/gff/Gene.kt b/src/main/kotlin/biokotlin/gff/Gene.kt index f96c980b..3f945339 100644 --- a/src/main/kotlin/biokotlin/gff/Gene.kt +++ b/src/main/kotlin/biokotlin/gff/Gene.kt @@ -6,7 +6,7 @@ class Gene( source: String, start: Int, end: Int, - score: Double = 0.0, + score: Double = Double.NaN, strand: String = "+", phase: String = ".", attributes: Map = emptyMap(), diff --git a/src/main/kotlin/biokotlin/gff/Leader.kt b/src/main/kotlin/biokotlin/gff/Leader.kt index a6e32f17..c2518a5c 100644 --- a/src/main/kotlin/biokotlin/gff/Leader.kt +++ b/src/main/kotlin/biokotlin/gff/Leader.kt @@ -8,7 +8,7 @@ class Leader( source: String, start: Int, end: Int, - score: Double = 0.0, + score: Double = Double.NaN, strand: String = "+", phase: String = ".", attributes: Map = emptyMap(), diff --git a/src/main/kotlin/biokotlin/gff/MRNA.kt b/src/main/kotlin/biokotlin/gff/MRNA.kt index fe31bfe6..5a27d2ed 100644 --- a/src/main/kotlin/biokotlin/gff/MRNA.kt +++ b/src/main/kotlin/biokotlin/gff/MRNA.kt @@ -8,7 +8,7 @@ class MRNA( source: String, start: Int, end: Int, - score: Double = 0.0, + score: Double = Double.NaN, strand: String = "+", phase: String = ".", attributes: Map = emptyMap(), diff --git a/src/main/kotlin/biokotlin/gff/Terminator.kt b/src/main/kotlin/biokotlin/gff/Terminator.kt index 0b71033f..9b6e5a12 100644 --- a/src/main/kotlin/biokotlin/gff/Terminator.kt +++ b/src/main/kotlin/biokotlin/gff/Terminator.kt @@ -8,7 +8,7 @@ class Terminator( source: String, start: Int, end: Int, - score: Double = 0.0, + score: Double = Double.NaN, strand: String = "+", phase: String = ".", attributes: Map = emptyMap(), From 7c6d15fb9919bce931102ce872368e93f55a1290 Mon Sep 17 00:00:00 2001 From: Jeffrey Morse Date: Fri, 10 Jun 2022 10:48:05 -0400 Subject: [PATCH 10/19] First draft of parser --- src/main/kotlin/biokotlin/gff/GffReader.kt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 src/main/kotlin/biokotlin/gff/GffReader.kt diff --git a/src/main/kotlin/biokotlin/gff/GffReader.kt b/src/main/kotlin/biokotlin/gff/GffReader.kt new file mode 100644 index 00000000..5f3a8642 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/GffReader.kt @@ -0,0 +1,2 @@ +package biokotlin.gff + From 039da2df1abd2582d176f04622a5df147485de4d Mon Sep 17 00:00:00 2001 From: Jeffrey Morse Date: Fri, 10 Jun 2022 10:48:31 -0400 Subject: [PATCH 11/19] Added convenience method for converting from string to feature type --- src/main/kotlin/biokotlin/gff/Feature.kt | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/main/kotlin/biokotlin/gff/Feature.kt b/src/main/kotlin/biokotlin/gff/Feature.kt index e73899e4..319e87ce 100644 --- a/src/main/kotlin/biokotlin/gff/Feature.kt +++ b/src/main/kotlin/biokotlin/gff/Feature.kt @@ -1,6 +1,27 @@ package biokotlin.gff -enum class FeatureType { Gene, Exon, Leader, Terminator, Coding, mRNA, Intron, Chromosome } +enum class FeatureType { + Gene, Exon, Leader, Terminator, Coding, mRNA, Intron, Chromosome; + + companion object { + /** + * Converts from standard names in GFF files to a FeatureType. Case-insensitive + */ + fun fromGffString(gffString: String): FeatureType { + return when (gffString.lowercase()) { + "gene" -> Gene + "exon" -> Exon + "five_prime_utr" -> Leader + "three_prime_utr" -> Terminator + "cds" -> Coding + "mrna" -> mRNA + "intron" -> Intron + "chromosome" -> Chromosome + else -> throw Exception("Feature $gffString is not supported") + } + } + } +} /** * @param seqid The ID of the landmark used to establish the coordinate system for the current feature. IDs may contain From 0454d78d68e0c7c8b7102f1cbfdebce94d7cf9fd Mon Sep 17 00:00:00 2001 From: Terry Casstevens Date: Fri, 10 Jun 2022 10:48:38 -0400 Subject: [PATCH 12/19] Adding initial GFF Feature data structures --- src/main/kotlin/biokotlin/gff/Feature.kt | 2 +- src/main/kotlin/biokotlin/gff/FeatureBuilder.kt | 1 + src/main/kotlin/biokotlin/gff/Scaffold.kt | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 src/main/kotlin/biokotlin/gff/Scaffold.kt diff --git a/src/main/kotlin/biokotlin/gff/Feature.kt b/src/main/kotlin/biokotlin/gff/Feature.kt index c23a244a..2ec5f264 100644 --- a/src/main/kotlin/biokotlin/gff/Feature.kt +++ b/src/main/kotlin/biokotlin/gff/Feature.kt @@ -1,6 +1,6 @@ package biokotlin.gff -enum class FeatureType { Gene, Exon, Leader, Terminator, Coding, mRNA, Intron, Chromosome } +enum class FeatureType { Gene, Exon, Leader, Terminator, Coding, mRNA, Intron, Chromosome, Scaffold } /** * @param seqid The ID of the landmark used to establish the coordinate system for the current feature. IDs may contain diff --git a/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt b/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt index 6fc1215d..9abe9bfc 100644 --- a/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt +++ b/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt @@ -36,6 +36,7 @@ class FeatureBuilder( FeatureType.mRNA -> MRNA(seqid, source, start, end, score, strand, phase, attributes, children) FeatureType.Intron -> Intron(seqid, source, start, end) FeatureType.Chromosome -> Chromosome(seqid, source, start, end, attributes, children) + FeatureType.Scaffold -> Scaffold(seqid, source, start, end, attributes, children) } } diff --git a/src/main/kotlin/biokotlin/gff/Scaffold.kt b/src/main/kotlin/biokotlin/gff/Scaffold.kt new file mode 100644 index 00000000..476f5369 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Scaffold.kt @@ -0,0 +1,14 @@ +package biokotlin.gff + +class Scaffold( + seqid: String, + source: String, + start: Int, + end: Int, + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, source, start, end, attributes = attributes, children = children) { + + override fun type(): FeatureType = FeatureType.Scaffold + +} \ No newline at end of file From ee5152317e16afbb19029ee3b22cfc029c761595 Mon Sep 17 00:00:00 2001 From: Jeffrey Morse Date: Fri, 10 Jun 2022 15:10:13 -0400 Subject: [PATCH 13/19] Added toString and lazyEquals methods --- src/main/kotlin/biokotlin/gff/Feature.kt | 69 +++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/src/main/kotlin/biokotlin/gff/Feature.kt b/src/main/kotlin/biokotlin/gff/Feature.kt index f16c558b..de418b85 100644 --- a/src/main/kotlin/biokotlin/gff/Feature.kt +++ b/src/main/kotlin/biokotlin/gff/Feature.kt @@ -17,6 +17,7 @@ enum class FeatureType { "mrna" -> mRNA "intron" -> Intron "chromosome" -> Chromosome + "scaffold" -> Scaffold else -> throw Exception("Feature $gffString is not supported") } } @@ -105,4 +106,70 @@ abstract class Feature( fun isCircular() = attributes["Is_circular"] -} + /** + * Converts to a DOT file (without header), for graph visualization + */ + fun toDot(): String { + val sb = StringBuilder() + for (child in children) { + sb.append("\"$this\" -> \"$child\"\n") + sb.append(child.toDot()) + } + sb.append("\"$this\"[label = \"$seqid: ${type()} $start-$end\"]") + return sb.toString() + } + + /** + * Compares if this and other have the same String representation, and recurisvely checks that for the children. + * It is possible for two things to be equal even if their children are not in the same order (sometimes non-equal + * children have the same start so they get sorted arbitrarily). + */ + fun lazyEquals(other: Feature): Boolean { + val equalOnThisLevel = seqid == other.seqid && + source == other.source && + start == other.start && + end == other.end && + strand == other.strand && + phase == other.phase && + id() == other.id() && + parent() == other.parent() + if (!equalOnThisLevel) { + return false + } else if (children.isEmpty() && other.children.isEmpty()){ + return true + } + else { + val aggregatedChildren = children.map { it.toString() }.reduce{ acc, string -> acc + string } + val aggregatedOtherChildren = other.children.map { it.toString() }.reduce{ acc, string -> acc + string } + for (i in children.indices) { + if (!aggregatedOtherChildren.contains(children[i].toString())) { + return false + } + } + for (i in other.children.indices) { + if(!aggregatedChildren.contains(other.children[i].toString())) { + return false + } + } + } + return true + } + + /** + * Returns the feature as a string representing row in a GFF file + */ + override fun toString(): String { + val scoreString = if (score.isNaN()) { + "." + } else { + score.toString() + } + + val attributesString = StringBuilder() + for ((tag, value) in attributes) { + attributesString.append("$tag:$value;") + } + + return "$seqid\t$source\t${type()}\t$start\t$end\t$scoreString\t$strand\t$phase\t${attributesString}\n" + } +} \ No newline at end of file From 30386514fca6fbd6982aa5d2d3c9e99e42af3fed Mon Sep 17 00:00:00 2001 From: Jeffrey Morse Date: Fri, 10 Jun 2022 15:10:55 -0400 Subject: [PATCH 14/19] Added GFF parser, tested to see if it works with scrambled orders (it does but my test is messy) --- src/main/kotlin/biokotlin/gff/GffReader.kt | 91 ++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/src/main/kotlin/biokotlin/gff/GffReader.kt b/src/main/kotlin/biokotlin/gff/GffReader.kt index 5f3a8642..aaf67613 100644 --- a/src/main/kotlin/biokotlin/gff/GffReader.kt +++ b/src/main/kotlin/biokotlin/gff/GffReader.kt @@ -1,2 +1,93 @@ package biokotlin.gff +import biokotlin.genome.* +import biokotlin.util.bufferedReader +import org.jetbrains.kotlinx.dataframe.DataRow +import org.jetbrains.kotlinx.dataframe.api.* +/** + * Parses a GFF and returns a list of top-level features in order of their start + * @param gff Path to gff file + */ +fun parseGff(gff: String): List { + val startTime = System.currentTimeMillis() + val file = bufferedReader(gff) + val roots = MutableList(0) { FeatureBuilder("", "", FeatureType.Chromosome, 0, 0) } + val orphans = MutableList(0) { FeatureBuilder("", "", FeatureType.Chromosome, 0, 0) } + val registry = HashMap(0) //id to feature builder + + var line = file.readLine() + var lineNumber = 0 + while (line != null) { + if (line.startsWith("#")) { + line = file.readLine() + lineNumber++ + continue + } + + val split = line.split("\t") + val attributes = split[8].split(";") + val attributeMap = HashMap(0) + attributes.forEach { + val splitAttribute = it.split("=") + attributeMap[splitAttribute[0]] = splitAttribute[1] + } + val score = if (split[5] == ".") { + Double.NaN + } else { + split[5].toDouble() + } + val featureBuilder = FeatureBuilder( + split[0], + split[1], + FeatureType.fromGffString(split[2]), + split[3].toInt(), + split[4].toInt(), + score, + split[6], + split[7], + attributeMap + ) + + if (featureBuilder.attributes["ID"] != null) { + registry[featureBuilder.attributes["ID"]!!] = featureBuilder + } + + if (featureBuilder.attributes["Parent"] != null) { + if (registry.contains(featureBuilder.attributes["Parent"])) { + registry[featureBuilder.attributes["Parent"]]?.add(featureBuilder) + } else { + orphans.add(featureBuilder) + } + } else { + roots.add(featureBuilder) + } + + line = file.readLine() + lineNumber++ + } + + for (orphan in orphans) { + if (registry.contains(orphan.attributes["Parent"])) { + registry[orphan.attributes["Parent"]]?.add(orphan) + } else { + roots.add(orphan) + println("Warning: Orphaned element. Parent ${orphan.attributes["Parent"]} is not in the file") + } + } + + println("Time: ${System.currentTimeMillis() - startTime}ms") + return roots.sortedBy { it.start } .map { it.build() } +} + +fun main() { + val roots = parseGff("/home/jeff/Buckler/Biokotlin/b73.gff") + val shuffledRoots = parseGff("/home/jeff/Buckler/Biokotlin/b73_shuffled.gff") + + var errors = 0 + for (i in 0 until roots.size) { + if (!roots[i].lazyEquals(shuffledRoots[i])) { + errors++ + } + } + println("Number of errors $errors") +} \ No newline at end of file From 748a3878e0f382071394812d957e9b1316a78cba Mon Sep 17 00:00:00 2001 From: Jeffrey Morse Date: Fri, 10 Jun 2022 15:16:23 -0400 Subject: [PATCH 15/19] Corrected bug in Feature toString --- src/main/kotlin/biokotlin/gff/Feature.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/kotlin/biokotlin/gff/Feature.kt b/src/main/kotlin/biokotlin/gff/Feature.kt index de418b85..cb1235bc 100644 --- a/src/main/kotlin/biokotlin/gff/Feature.kt +++ b/src/main/kotlin/biokotlin/gff/Feature.kt @@ -167,7 +167,7 @@ abstract class Feature( val attributesString = StringBuilder() for ((tag, value) in attributes) { - attributesString.append("$tag:$value;") + attributesString.append("$tag=$value;") } return "$seqid\t$source\t${type()}\t$start\t$end\t$scoreString\t$strand\t$phase\t${attributesString}\n" From da41058b1a794355a84b1970d15abb9eebefff94 Mon Sep 17 00:00:00 2001 From: Jeffrey Morse Date: Fri, 10 Jun 2022 15:27:42 -0400 Subject: [PATCH 16/19] Made parser sort for seqid instead of just start --- src/main/kotlin/biokotlin/gff/GffReader.kt | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/main/kotlin/biokotlin/gff/GffReader.kt b/src/main/kotlin/biokotlin/gff/GffReader.kt index aaf67613..558cd3b0 100644 --- a/src/main/kotlin/biokotlin/gff/GffReader.kt +++ b/src/main/kotlin/biokotlin/gff/GffReader.kt @@ -5,7 +5,8 @@ import biokotlin.util.bufferedReader import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.api.* /** - * Parses a GFF and returns a list of top-level features in order of their start + * Parses a GFF and returns a list of top-level features sorted alphabetically by their seqid and then numerically + * by their start * @param gff Path to gff file */ fun parseGff(gff: String): List { @@ -76,7 +77,20 @@ fun parseGff(gff: String): List { } println("Time: ${System.currentTimeMillis() - startTime}ms") - return roots.sortedBy { it.start } .map { it.build() } + + class RootSorter: Comparator { + override fun compare(p0: FeatureBuilder?, p1: FeatureBuilder?): Int { + if (p0 == null || p1 == null) return 0 + return if (p0.seqid == p1.seqid) { + p0.start - p1.start + } else { + p0.seqid.compareTo(p1.seqid) + } + } + + } + + return roots.sortedWith(RootSorter()).map { it.build() } } fun main() { From d6b35fd28be0bcef48535bb6ba9eda4657507f0c Mon Sep 17 00:00:00 2001 From: Jeffrey Morse Date: Fri, 17 Jun 2022 16:04:37 -0400 Subject: [PATCH 17/19] Committing prior to branching to reduce the number of clases in the Feature framework --- src/main/kotlin/biokotlin/gff/Feature.kt | 77 ++++---- .../kotlin/biokotlin/gff/FeatureBuilder.kt | 20 ++- src/main/kotlin/biokotlin/gff/GffReader.kt | 107 ------------ src/main/kotlin/biokotlin/gff/GffTree.kt | 165 ++++++++++++++++++ src/main/kotlin/biokotlin/gff/MRNA.kt | 15 ++ 5 files changed, 221 insertions(+), 163 deletions(-) delete mode 100644 src/main/kotlin/biokotlin/gff/GffReader.kt create mode 100644 src/main/kotlin/biokotlin/gff/GffTree.kt diff --git a/src/main/kotlin/biokotlin/gff/Feature.kt b/src/main/kotlin/biokotlin/gff/Feature.kt index cb1235bc..8f3e6657 100644 --- a/src/main/kotlin/biokotlin/gff/Feature.kt +++ b/src/main/kotlin/biokotlin/gff/Feature.kt @@ -61,7 +61,8 @@ enum class FeatureType { * by semicolons. URL escaping rules are used for tags or values containing the following characters: ",=;". Spaces * are allowed in this field, but tabs must be replaced with the %09 URL escape. Attribute values do not need to be * and should not be quoted. The quotes should be included as part of the value by parsers and not stripped. - * @param children + * @param children TODO + * @see FeatureBuilder */ abstract class Feature( val seqid: String, @@ -77,14 +78,15 @@ abstract class Feature( init { attributes = attributes.toMap() - children = children.toList().sortedBy { it.start } + children = children.sortedWith(FeatureComparator()) } abstract fun type(): FeatureType fun attribute(key: String) = attributes[key] - fun parent() = attributes["Parent"] + //TODO make this an actual pointer and handle multiple parents + fun parent(): Feature = TODO() fun id() = attributes["ID"] @@ -107,54 +109,21 @@ abstract class Feature( fun isCircular() = attributes["Is_circular"] /** - * Converts to a DOT file (without header), for graph visualization + * Compares this to [other] alphabetically by seqid, then by start, then by end position. + * Returns zero if this and [other] are equal in ordering, a negative number if this is less + * than [other], or a positive number if this is greater than [other]. */ - fun toDot(): String { - val sb = StringBuilder() - for (child in children) { - sb.append("\"$this\" -> \"$child\"\n") - sb.append(child.toDot()) - } - sb.append("\"$this\"[label = \"$seqid: ${type()} $start-$end\"]") - return sb.toString() - } - - /** - * Compares if this and other have the same String representation, and recurisvely checks that for the children. - * It is possible for two things to be equal even if their children are not in the same order (sometimes non-equal - * children have the same start so they get sorted arbitrarily). - */ - fun lazyEquals(other: Feature): Boolean { - val equalOnThisLevel = seqid == other.seqid && - source == other.source && - start == other.start && - end == other.end && - strand == other.strand && - phase == other.phase && - id() == other.id() && - parent() == other.parent() - if (!equalOnThisLevel) { - return false - } else if (children.isEmpty() && other.children.isEmpty()){ - return true - } - else { - val aggregatedChildren = children.map { it.toString() }.reduce{ acc, string -> acc + string } - val aggregatedOtherChildren = other.children.map { it.toString() }.reduce{ acc, string -> acc + string } - for (i in children.indices) { - if (!aggregatedOtherChildren.contains(children[i].toString())) { - return false - } - } - for (i in other.children.indices) { - if(!aggregatedChildren.contains(other.children[i].toString())) { - return false - } + fun compareTo(other: Feature): Int { + return if (seqid.compareTo(other.seqid) == 0) { + if (start.compareTo(other.start) == 0) { + end.compareTo(other.end) + } else { + start.compareTo(other.start) } + } else { + seqid.compareTo(other.seqid) } - return true } - /** * Returns the feature as a string representing row in a GFF file */ @@ -172,4 +141,18 @@ abstract class Feature( return "$seqid\t$source\t${type()}\t$start\t$end\t$scoreString\t$strand\t$phase\t${attributesString}\n" } +} + +/** + * Provides ordering for feature + */ +class FeatureComparator: Comparator { + /** + * Returns the same result as [p0].compareTo([p1]) unless one of the arguments is null, in which case it returns 0. + */ + override fun compare(p0: Feature?, p1: Feature?): Int { + if (p0 == null || p1 == null) return 0 + + return p0.compareTo(p1) + } } \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt b/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt index 9abe9bfc..856081da 100644 --- a/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt +++ b/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt @@ -1,5 +1,10 @@ package biokotlin.gff +//TODO add pointer to parent(s) +/** + * A mutable representation of a genetic feature that can be built into a [Feature]. + * @see Feature + */ class FeatureBuilder( val seqid: String, val source: String, @@ -12,10 +17,6 @@ class FeatureBuilder( var attributes: Map = emptyMap() ) { - init { - id()?.let { builderMap[it] = this } - } - val children = mutableListOf() fun id() = attributes["ID"] @@ -24,6 +25,11 @@ class FeatureBuilder( children.add(child) } + /** + * Builds this feature and its children, recursively. + * @return An immutable [Feature] with the properties of this [FeatureBuilder] and whose children are built + * versions of the this [FeatureBuilder]'s children, sorted by [FeatureComparator]. + */ fun build(): Feature { val children = children.map { it.build() } @@ -41,8 +47,4 @@ class FeatureBuilder( } -} - -private val builderMap = mutableMapOf() - -fun featureBuilder(id: String) = builderMap[id] \ No newline at end of file +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/GffReader.kt b/src/main/kotlin/biokotlin/gff/GffReader.kt deleted file mode 100644 index 558cd3b0..00000000 --- a/src/main/kotlin/biokotlin/gff/GffReader.kt +++ /dev/null @@ -1,107 +0,0 @@ -package biokotlin.gff - -import biokotlin.genome.* -import biokotlin.util.bufferedReader -import org.jetbrains.kotlinx.dataframe.DataRow -import org.jetbrains.kotlinx.dataframe.api.* -/** - * Parses a GFF and returns a list of top-level features sorted alphabetically by their seqid and then numerically - * by their start - * @param gff Path to gff file - */ -fun parseGff(gff: String): List { - val startTime = System.currentTimeMillis() - val file = bufferedReader(gff) - val roots = MutableList(0) { FeatureBuilder("", "", FeatureType.Chromosome, 0, 0) } - val orphans = MutableList(0) { FeatureBuilder("", "", FeatureType.Chromosome, 0, 0) } - val registry = HashMap(0) //id to feature builder - - var line = file.readLine() - var lineNumber = 0 - while (line != null) { - if (line.startsWith("#")) { - line = file.readLine() - lineNumber++ - continue - } - - val split = line.split("\t") - val attributes = split[8].split(";") - val attributeMap = HashMap(0) - attributes.forEach { - val splitAttribute = it.split("=") - attributeMap[splitAttribute[0]] = splitAttribute[1] - } - val score = if (split[5] == ".") { - Double.NaN - } else { - split[5].toDouble() - } - val featureBuilder = FeatureBuilder( - split[0], - split[1], - FeatureType.fromGffString(split[2]), - split[3].toInt(), - split[4].toInt(), - score, - split[6], - split[7], - attributeMap - ) - - if (featureBuilder.attributes["ID"] != null) { - registry[featureBuilder.attributes["ID"]!!] = featureBuilder - } - - if (featureBuilder.attributes["Parent"] != null) { - if (registry.contains(featureBuilder.attributes["Parent"])) { - registry[featureBuilder.attributes["Parent"]]?.add(featureBuilder) - } else { - orphans.add(featureBuilder) - } - } else { - roots.add(featureBuilder) - } - - line = file.readLine() - lineNumber++ - } - - for (orphan in orphans) { - if (registry.contains(orphan.attributes["Parent"])) { - registry[orphan.attributes["Parent"]]?.add(orphan) - } else { - roots.add(orphan) - println("Warning: Orphaned element. Parent ${orphan.attributes["Parent"]} is not in the file") - } - } - - println("Time: ${System.currentTimeMillis() - startTime}ms") - - class RootSorter: Comparator { - override fun compare(p0: FeatureBuilder?, p1: FeatureBuilder?): Int { - if (p0 == null || p1 == null) return 0 - return if (p0.seqid == p1.seqid) { - p0.start - p1.start - } else { - p0.seqid.compareTo(p1.seqid) - } - } - - } - - return roots.sortedWith(RootSorter()).map { it.build() } -} - -fun main() { - val roots = parseGff("/home/jeff/Buckler/Biokotlin/b73.gff") - val shuffledRoots = parseGff("/home/jeff/Buckler/Biokotlin/b73_shuffled.gff") - - var errors = 0 - for (i in 0 until roots.size) { - if (!roots[i].lazyEquals(shuffledRoots[i])) { - errors++ - } - } - println("Number of errors $errors") -} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/GffTree.kt b/src/main/kotlin/biokotlin/gff/GffTree.kt new file mode 100644 index 00000000..e9492971 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/GffTree.kt @@ -0,0 +1,165 @@ +package biokotlin.gff + +import biokotlin.util.bufferedReader + +/** + * A tree representation of a GFF file, with parent/child relationships in the tree created from the Parent attribute + * of elements in the GFF file. + * @param gff Path to the GFF file + */ +class GffTree(val gff: String): Iterable { + /** + * The top-level features of the gff, sorted by [biokotlin.gff.FeatureComparator] + */ + val roots = parseRoots(gff) + + /** + * A map of all IDs to the feature with that ID. Features without IDs are excluded. + * TODO This shouldn't be a list + */ + val idMap: Map by lazy { + getMap ( {it.id()} ) + } + + //TODO add more map presets + + /** + * Iterates over the GffTree. Starts at the first root (as defined by the order in [biokotlin.gff.FeatureComparator]), + * performs a depth-first traversal, then moves to the next root. + */ + override fun iterator(): Iterator { + return toList().iterator() + } + + /** + * Converts the tree representation to a list. The first element is the first root (as defined by the order in + * [biokotlin.gff.FeatureComparator]), then a depth-first ordering of its children, then the next root and a + * depth-first ordering of its children, etc. + */ + fun toList(): List { + TODO("Not yet implemented") + } + + /** + * Returns a map where each feature in the [GffTree] is added to a list with the key `mapping(feature)`. + * The features in each list will be ordered such that earlier elements in [iterator] are prior to later elements + * in [iterator]. When [mapping] returns `null`, no key-value pair is added to the map. The insertion order of + * the map follows the order of [iterator]. + * + * This can be useful for accessing features by their non-standard attributes + * ```kotlin + * val gffTree = GffTree("/home/user/b73.gff3") + * //Map the features by the non-standard protein_id attribute + * val proteinIDMap = gffTree.getMap { it.attributes["protein_id"] } + * //Access all features that have "Zm00001eb000020_P004" in their "protein_id" attribute + * val features = proteinIDMap["Zm00001eb000020_P004"] + * ``` + * + * Combined with custom data classes, this method also allows for convenient access of all features + * that have a particular combination of properties. + * + * Let's say you have a GFF where exons do not have unique IDs, but they can be uniquely defined by their parent's + * ID and their rank. + * ```kotlin + * data class parentAndRank(val parent: String, val rank: String) + * val gffTree = GffTree("/home/user/myGff.gff3") + * val parentAndRankMap = gffTree.getMap { feature -> + * val parent = feature.attributes["Parent"] + * val rank = feature.attributes["rank"] + * //Do not create mappings for anything that does not have a parent or a rank + * if (parent == null || rank == null) null + * else parentAndRank(parent, rank) + * } + * //Pull out the third exon on transcript Zm00001eb442870_T001 + * val exon = parentAndRankMap[parentAndRank("Zm00001eb442870_T001", "3")] + * ``` + * + * @see idMap + * @param mapping The function that produces keys for a given feature + */ + fun getMap(mapping: (Feature) -> T?): Map> { + TODO("Not yet implemented") + } + +} + +fun main() { + val gffTree = GffTree("/home/user/myGff.gff3") + val proteinIDs = gffTree.getMap { it.attributes["protein_id"] } + val protein = proteinIDs["Zm00001eb442910_P001"] // List of CDS regions + val cds = protein?.get(0) + val transcript = cds?.parent() + val gene = transcript?.parent() + +/** + * Parses a GFF and returns a list of top-level features sorted by [biokotlin.gff.FeatureComparator] + * @param gff Path to gff file + */ +private fun parseRoots(gff: String): List { + val file = bufferedReader(gff) + val roots = MutableList(0) { FeatureBuilder("", "", FeatureType.Chromosome, 0, 0) } + val orphans = MutableList(0) { FeatureBuilder("", "", FeatureType.Chromosome, 0, 0) } + val registry = HashMap(0) //id to feature builder + + var line = file.readLine() + var lineNumber = 0 + while (line != null) { + if (line.startsWith("#")) { + line = file.readLine() + lineNumber++ + continue + } + + val split = line.split("\t") + val attributes = split[8].split(";") + val attributeMap = HashMap(0) + attributes.forEach { + val splitAttribute = it.split("=") + attributeMap[splitAttribute[0]] = splitAttribute[1] + } + val score = if (split[5] == ".") { + Double.NaN + } else { + split[5].toDouble() + } + val featureBuilder = FeatureBuilder( + split[0], + split[1], + FeatureType.fromGffString(split[2]), + split[3].toInt(), + split[4].toInt(), + score, + split[6], + split[7], + attributeMap + ) + + if (featureBuilder.attributes["ID"] != null) { + registry[featureBuilder.attributes["ID"]!!] = featureBuilder + } + + if (featureBuilder.attributes["Parent"] != null) { + if (registry.contains(featureBuilder.attributes["Parent"])) { + registry[featureBuilder.attributes["Parent"]]?.add(featureBuilder) + } else { + orphans.add(featureBuilder) + } + } else { + roots.add(featureBuilder) + } + + line = file.readLine() + lineNumber++ + } + + for (orphan in orphans) { + if (registry.contains(orphan.attributes["Parent"])) { + registry[orphan.attributes["Parent"]]?.add(orphan) + } else { + roots.add(orphan) + println("Warning: Orphaned element. Parent ${orphan.attributes["Parent"]} is not in the file") + } + } + + return roots.map { it.build() }.sortedWith(FeatureComparator()) +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/MRNA.kt b/src/main/kotlin/biokotlin/gff/MRNA.kt index 5a27d2ed..4960c755 100644 --- a/src/main/kotlin/biokotlin/gff/MRNA.kt +++ b/src/main/kotlin/biokotlin/gff/MRNA.kt @@ -17,22 +17,37 @@ class MRNA( override fun type(): FeatureType = FeatureType.mRNA + /** + * @return A list of direct children that are an instance of [Coding], in order as defined by [FeatureComparator] + */ fun coding(): List { return children.filterIsInstance().toList() } + /** + * @return A list of direct children that are an instance of [Exon], in order as defined by [FeatureComparator] + */ fun exons(): List { return children.filterIsInstance().toList() } + /** + * @return A list of direct children that are an instance of [Leader], in order as defined by [FeatureComparator] + */ fun leaders(): List { return children.filterIsInstance().toList() } + /** + * @return A list of direct children that are an instance of [Terminator], in order as defined by [FeatureComparator] + */ fun terminators(): List { return children.filterIsInstance().toList() } + /** + * @return Introns in direct children + */ fun introns(): List { return children .chunked(2) From 33218f58250fca94543d61ab4f4f4f2986a87900 Mon Sep 17 00:00:00 2001 From: Jeffrey Morse Date: Thu, 23 Jun 2022 16:05:37 -0400 Subject: [PATCH 18/19] First draft of unified Feature/FeatureTree framework --- .../biokotlin/{gff => featureTree}/Feature.kt | 200 ++++++- .../biokotlin/featureTree/FeatureBuilder.kt | 77 +++ .../biokotlin/featureTree/FeatureTree.kt | 547 ++++++++++++++++++ .../MakeGffForNewSpecies.kt | 2 +- src/main/kotlin/biokotlin/gff/Chromosome.kt | 14 - src/main/kotlin/biokotlin/gff/Coding.kt | 20 - src/main/kotlin/biokotlin/gff/Exon.kt | 29 - .../kotlin/biokotlin/gff/FeatureBuilder.kt | 50 -- src/main/kotlin/biokotlin/gff/Gene.kt | 22 - src/main/kotlin/biokotlin/gff/GffTree.kt | 165 ------ src/main/kotlin/biokotlin/gff/Intron.kt | 12 - src/main/kotlin/biokotlin/gff/Leader.kt | 20 - src/main/kotlin/biokotlin/gff/MRNA.kt | 64 -- src/main/kotlin/biokotlin/gff/Scaffold.kt | 14 - src/main/kotlin/biokotlin/gff/Terminator.kt | 20 - 15 files changed, 799 insertions(+), 457 deletions(-) rename src/main/kotlin/biokotlin/{gff => featureTree}/Feature.kt (51%) create mode 100644 src/main/kotlin/biokotlin/featureTree/FeatureBuilder.kt create mode 100644 src/main/kotlin/biokotlin/featureTree/FeatureTree.kt rename src/main/kotlin/biokotlin/{gff => featureTree}/MakeGffForNewSpecies.kt (99%) delete mode 100644 src/main/kotlin/biokotlin/gff/Chromosome.kt delete mode 100644 src/main/kotlin/biokotlin/gff/Coding.kt delete mode 100644 src/main/kotlin/biokotlin/gff/Exon.kt delete mode 100644 src/main/kotlin/biokotlin/gff/FeatureBuilder.kt delete mode 100644 src/main/kotlin/biokotlin/gff/Gene.kt delete mode 100644 src/main/kotlin/biokotlin/gff/GffTree.kt delete mode 100644 src/main/kotlin/biokotlin/gff/Intron.kt delete mode 100644 src/main/kotlin/biokotlin/gff/Leader.kt delete mode 100644 src/main/kotlin/biokotlin/gff/MRNA.kt delete mode 100644 src/main/kotlin/biokotlin/gff/Scaffold.kt delete mode 100644 src/main/kotlin/biokotlin/gff/Terminator.kt diff --git a/src/main/kotlin/biokotlin/gff/Feature.kt b/src/main/kotlin/biokotlin/featureTree/Feature.kt similarity index 51% rename from src/main/kotlin/biokotlin/gff/Feature.kt rename to src/main/kotlin/biokotlin/featureTree/Feature.kt index 8f3e6657..46398665 100644 --- a/src/main/kotlin/biokotlin/gff/Feature.kt +++ b/src/main/kotlin/biokotlin/featureTree/Feature.kt @@ -1,7 +1,35 @@ -package biokotlin.gff +package biokotlin.featureTree +import kotlin.jvm.Throws + +/** + * @see Feature + */ enum class FeatureType { - Gene, Exon, Leader, Terminator, Coding, mRNA, Intron, Chromosome, Scaffold; + Gene, + Exon, + /** + * AKA 5' UTR + */ + Leader, + + /** + * AKA 3' UTR + */ + Terminator, + + /** + * AKA CDS + */ + Coding, + + /** + * AKA transcript + */ + mRNA, + Intron, + Chromosome, + Scaffold; companion object { /** @@ -32,6 +60,7 @@ enum class FeatureType { * generated this feature. Typically this is the name of a piece of software, such as "Genescan" or a database name, * such as "Genbank." In effect, the source is used to extend the feature ontology by adding a qualifier to the type * creating a new composite type that is a subclass of the type in the type column. + * @param type TODO * @param start The start and end coordinates of the feature are given in positive 1-based integer coordinates, relative * to the landmark given in column one. Start is always less than or equal to end. For features that cross the origin * of a circular feature (e.g. most bacterial genomes, plasmids, and some viral genomes), the requirement for start to @@ -42,7 +71,8 @@ enum class FeatureType { * be less than or equal to end is satisfied by making end = the position of the end + the length of the landmark feature. * @param score The score of the feature, a floating point number. As in earlier versions of the format, the semantics * of the score are ill-defined. It is strongly recommended that E-values be used for sequence similarity features, - * and that P-values be used for ab initio gene prediction features. + * and that P-values be used for ab initio gene prediction features. Represented by Double.NaN when no score exists + * for a particular feature. * @param strand The strand of the feature. + for positive strand (relative to the landmark), - for minus strand, * and . for features that are not stranded. In addition, ? can be used for features whose strandedness is relevant, * but unknown. @@ -60,45 +90,79 @@ enum class FeatureType { * @param attributes A list of feature attributes in the format tag=value. Multiple tag=value pairs are separated * by semicolons. URL escaping rules are used for tags or values containing the following characters: ",=;". Spaces * are allowed in this field, but tabs must be replaced with the %09 URL escape. Attribute values do not need to be - * and should not be quoted. The quotes should be included as part of the value by parsers and not stripped. + * and should not be quoted. The quotes should be included as part of the value by parsers and not stripped. When there + * are multiple values for the same tag, they are separated by a comma. * @param children TODO * @see FeatureBuilder */ -abstract class Feature( +class Feature( val seqid: String, val source: String, + val type: FeatureType, val start: Int, val end: Int, val score: Double = Double.NaN, val strand: String = "+", val phase: String = ".", - var attributes: Map = emptyMap(), - var children: List = emptyList() -) { + val attributes: Map> = emptyMap(), + children: List = emptyList(), +): FeatureTree(children) { + + private val parents = mutableListOf() + + /** + * Adds a parent. ONLY FOR USE IN BUILDERS! + */ + internal fun addParent(parent: FeatureTree) { + parents.add(parent) + } - init { - attributes = attributes.toMap() - children = children.sortedWith(FeatureComparator()) + /** + * @return one of the parents of this [Feature] that is an instance of [Feature] or null if no such parent exists. + * This function is to provide convenience for features that are known to only have one parent. To prevent excessive + * casts, this function cannot access a top-level [FeatureTree] parent that is not an instance of [Feature]. To + * access this top level container, use [root]. + * @see root + * @see parents + */ + fun parent(): Feature? { + return parents.find{ it is Feature } as? Feature } - abstract fun type(): FeatureType + /** + * @return The parents of this [Feature] that are instances of [Feature] or an empty list if no such parents + * exist. To access a top-level [FeatureTree] that is not a [Feature], use [root]. + * @see root + * @see parent + */ + fun parents(): List { + return parents.filterIsInstance().sortedWith(FeatureComparator()) + } - fun attribute(key: String) = attributes[key] + /** + * @return the top-level container that is an ancestor of this [Feature]. + */ + fun root(): FeatureTree { + for (feature in ancestors()) { + if (feature.parents.isEmpty()) return this + if (feature.parents[0] !is Feature) return feature.parents[0] + } + throw Exception("Malformed FeatureTree. Could not find root.") + } - //TODO make this an actual pointer and handle multiple parents - fun parent(): Feature = TODO() + //Some attributes cannot have multiple values, so the .first() calls allows for more convenience - fun id() = attributes["ID"] + fun id() = attributes["ID"]?.first() - fun name() = attributes["Name"] + fun name() = attributes["Name"]?.first() fun alias() = attributes["Alias"] - fun target() = attributes["Target"] + fun target() = attributes["Target"]?.first() - fun gap() = attributes["Gap"] + fun gap() = attributes["Gap"]?.first() - fun derivesFrom() = attributes["Derives_from"] + fun derivesFrom() = attributes["Derives_from"]?.first() fun note() = attributes["Note"] @@ -106,7 +170,7 @@ abstract class Feature( fun ontologyTerm() = attributes["Ontology_term"] - fun isCircular() = attributes["Is_circular"] + fun isCircular() = attributes["Is_circular"]?.first() /** * Compares this to [other] alphabetically by seqid, then by start, then by end position. @@ -124,8 +188,9 @@ abstract class Feature( seqid.compareTo(other.seqid) } } + /** - * Returns the feature as a string representing row in a GFF file + * @return The feature as a string representing row in a GFF file. */ override fun toString(): String { val scoreString = if (score.isNaN()) { @@ -135,12 +200,85 @@ abstract class Feature( } val attributesString = StringBuilder() - for ((tag, value) in attributes) { - attributesString.append("$tag=$value;") + for ((tag, set) in attributes) { + attributesString.append(tag).append("=") + for (value in set) { + attributesString.append(value).append(",") + } } + //TODO naively doing type will not line up properly with the gff name + return "$seqid\t$source\t${type}\t$start\t$end\t$scoreString\t$strand\t$phase\t${attributesString}\n" + } - return "$seqid\t$source\t${type()}\t$start\t$end\t$scoreString\t$strand\t$phase\t${attributesString}\n" + /** + * @return a list containing this [Feature] and ancestors of this feature. The ordering is depth-first, + * left-to-right. Ancestors that are not instances of [Feature] are ignored. + */ + fun ancestors(): List { + val ancestors = mutableListOf() + ancestors.add(this) + for (parent in parents()) { + println(parent) + ancestors.addAll(parent.ancestors()) + } + return ancestors } + + /** + * @return the first exon in this [Feature]'s ancestors, or null if there are none. + */ + fun exon(): Feature? { + return ancestors().find { it.type == FeatureType.Exon } + } + + /** + * @return the first gene in this [Feature]'s ancestors, or null if there are none. + */ + fun gene(): Feature? { + return ancestors().find { it.type == FeatureType.Gene } + } + + /** + * @return the first transcript in this [Feature]'s ancestors, or null if there are none. + */ + fun transcript(): Feature? { + return ancestors().find { it.type == FeatureType.mRNA } + } + + /** + * @return the first scaffold in this [Feature]'s ancestors, or null if there are none. + */ + fun scaffold(): Feature? { + return ancestors().find { it.type == FeatureType.Scaffold } + } + + /** + * @return the first chromosome in this [Feature]'s ancestors, or null if there are none. + */ + fun chromosome(): Feature? { + return ancestors().find { it.type == FeatureType.Chromosome } + } + + /** + * @return the first contig in this [Feature]'s ancestors, or null if there are none. + */ + fun contig(): Feature? { + return ancestors().find { it.type == FeatureType.Scaffold || it.type == FeatureType.Chromosome } + } + + override fun nonRecursiveDot(): java.lang.StringBuilder { + val sb = StringBuilder() + if (id() != null) sb.append("\"${hashCode()}\" [label=\"${id()}\\n$type $start-$end\" color = ${typeToColor(type)}]\n") + else sb.append("\"${hashCode()}\" [label=\"$type $start-$end\" color = ${typeToColor(type)}]\n") + for (child in children) sb.append("${hashCode()} -> ${child.hashCode()}\n") + for (parent in parents) sb.append("${hashCode()} -> ${parent.hashCode()}\n") + return sb + } + + private fun typeToColor(type: FeatureType): Int { + return type.ordinal % 11 + 1 + } + } /** @@ -152,7 +290,17 @@ class FeatureComparator: Comparator { */ override fun compare(p0: Feature?, p1: Feature?): Int { if (p0 == null || p1 == null) return 0 - return p0.compareTo(p1) } +} + +fun invariantBroken(culprit: Any, number: Int) { + println("=========================================================================") + println("Invariant $number broken for class ${culprit::class}. culprit.toString():") + println(culprit.toString()) +} + +fun main() { + val tree = FeatureTree.fromGff("/home/jeff/Buckler/Biokotlin/biokotlin/src/test/resources/biokotlin/featureTree/b73_shortened.gff") + println(tree.toDot()) } \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/featureTree/FeatureBuilder.kt b/src/main/kotlin/biokotlin/featureTree/FeatureBuilder.kt new file mode 100644 index 00000000..e6eea51c --- /dev/null +++ b/src/main/kotlin/biokotlin/featureTree/FeatureBuilder.kt @@ -0,0 +1,77 @@ +package biokotlin.featureTree + +open class FeatureTreeBuilder { + protected val children = mutableListOf() + protected val builtChildren = mutableListOf() + + fun addChild(child: FeatureBuilder) { + children.add(child) + child.addParent(this) + } + + internal fun addBuiltChild(child: Feature) { + builtChildren.add(child) + } + + open fun build(): FeatureTree { + val built = FeatureTree(children.map { it.build() }) + for (builtChild in builtChildren) builtChild.addParent(built) + builtChildren.clear() + return built + } +} + +/** + * A mutable representation of a genetic feature that can be built into a [Feature]. + * @see Feature + */ +class FeatureBuilder( + val seqid: String, + val source: String, + val type: FeatureType, + val start: Int, + val end: Int, + val score: Double = 0.0, + val strand: String = "+", + val phase: String = ".", + var attributes: Map> = emptyMap() //TODO make Set to account for multiple values in the same attribute +): FeatureTreeBuilder() { + + private val parents = mutableListOf() + + fun addParent(parent: FeatureTreeBuilder) { + parents.add(parent) + } + + fun id() = attributes["ID"] + + /** + * @return an immutable tree representation of this [Feature]. To simultaneously build multiple top-level + * elements into the same tree, use [buildFromList] + */ + override fun build(): Feature { + val children = children.map { it.build() } + val built = Feature(seqid, source, type, start, end, score, strand, phase, attributes, children) + for (parent in parents) parent.addBuiltChild(built) + for (builtChild in builtChildren) builtChild.addParent(built) + builtChildren.clear() + return built + } + + companion object { + /** + * @param list the top-level elements to be built. + * @return a built version of the trees represented by [list], combined into a single tree. + */ + fun buildFromList(list: List): FeatureTree { + if (list.isEmpty()) return FeatureTree(emptyList()) + if (list.size == 1) return list[0].build() + else { + val tree = FeatureTreeBuilder() + for (child in list) tree.addChild(child) + return tree.build() + } + } + } + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/featureTree/FeatureTree.kt b/src/main/kotlin/biokotlin/featureTree/FeatureTree.kt new file mode 100644 index 00000000..7df27f1e --- /dev/null +++ b/src/main/kotlin/biokotlin/featureTree/FeatureTree.kt @@ -0,0 +1,547 @@ +package biokotlin.featureTree + +import biokotlin.util.bufferedReader +import htsjdk.samtools.util.SequenceUtil.a + +/** + * The root of a tree of GFF Features. Contains useful methods for parsing down a tree of such features. + * A [Feature] is a subclass of [FeatureTree] with additional information that describes the properties of the + * [Feature]. Note that instances of [FeatureTree] that are not instances of [Feature] must be top-level roots and are + * ___excluded___ from most operations on the tree, including the iterator. This is because these roots are simply + * artificial containers that do not hold meaningful information from the GFF file. + * @see Feature + */ +open class FeatureTree (children: List): Iterable { + val children = children.sortedWith(FeatureComparator()) + + companion object { + /** + * @param gff a pathname of a gff file. + * @return a tree representing the features of [gff] + */ + fun fromGff(gff: String): FeatureTree { + + val file = bufferedReader(gff) + val registry = HashMap(0) //id to feature builder + val roots = mutableListOf() + + var line = file.readLine() + while (line != null) { + if (line.startsWith("#")) { + line = file.readLine() + continue + } + + val split = line.split("\t") + val attributes = split[8].split(";") + val attributeMap = HashMap>(0) + attributes.forEach { + val tagValue = it.split("=") + val tag = tagValue[0] + val values = tagValue[1].split(",") + attributeMap[tag] = values.toSet() + } + val score = split[5].toDoubleOrNull() ?: Double.NaN + + val featureBuilder = FeatureBuilder( + split[0], + split[1], + FeatureType.fromGffString(split[2]), + split[3].toInt(), + split[4].toInt(), + score, + split[6], + split[7], + attributeMap + ) + + if (featureBuilder.attributes["ID"] != null) { + registry[featureBuilder.attributes["ID"]!!.first()] = featureBuilder + } + + if (featureBuilder.attributes["Parent"] != null) { + val parents = featureBuilder.attributes["Parent"]!! + for (parent in parents) { + if (registry.contains(parent)) { + registry[parent]?.addChild(featureBuilder) + } else { + TODO("Please use a well-formed GFF (meaning parents occur before all their children)") + } + } + } else { + roots.add(featureBuilder) + } + + line = file.readLine() + } + + return FeatureBuilder.buildFromList(roots) + } + } + + /** + * Iterates over the tree rooted at this [FeatureTree] in depth-first order. Elements that are not instances + * of [Feature] are ignored. + */ + override fun iterator(): Iterator { + return toList().iterator() + } + + /** + * A map of all IDs of features in this [FeatureTree] to the feature with that ID. Features without IDs are excluded. + */ + val byID: Map by lazy { associateBy { it.id() } } + + /** + * A map that groups features in this [FeatureTree] by seqid. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val bySeqid: Map> by lazy { groupBy {it.seqid} } + + /** + * A map that groups features in this [FeatureTree] by source. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val bySource: Map> by lazy { groupBy {it.source} } + + /** + * A map that groups features in this [FeatureTree] by type. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byType: Map> by lazy { groupBy {it.type} } + + /** + * A map that groups features in this [FeatureTree] by start position. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byStart: Map> by lazy { groupBy {it.start} } + + /** + * A map that groups features in this [FeatureTree] by end position. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byEnd: Map> by lazy { groupBy {it.end} } + + /** + * A map that groups features in this [FeatureTree] by score. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byScore: Map> by lazy { groupBy {it.score} } + + /** + * A map that groups features in this [FeatureTree] by strand. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byStrand: Map> by lazy { groupBy {it.strand} } + + /** + * A map that groups features in this [FeatureTree] by phase. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byPhase: Map> by lazy { groupBy {it.phase} } + + /** + * A map that groups features in this [FeatureTree] by name. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byName: Map> by lazy { groupBy {it.name()} } + + /** + * A map that groups features in this [FeatureTree] by their alias(es). Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byAlias: Map, List> by lazy { groupBy {it.alias()} } + + /** + * A map that groups features in this [FeatureTree] by their parent ID(s). Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byParentID: Map, List> by lazy { groupBy {it.attributes["Parent"]} } + + /** + * A map that groups features in this [FeatureTree] by their parent features (sorted by [FeatureComparator]). + * Order within the value lists preserves the order in the iterator of this [FeatureTree]. + */ + val byParent: Map, List> by lazy { groupBy {it.parents()} } + + /** + * A map that groups features in this [FeatureTree] by their target. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byTarget: Map> by lazy { groupBy {it.target()} } + + /** + * A map that groups features in this [FeatureTree] by their Derives_from attribute. Order within the + * lists preserves the order in the iterator of this [FeatureTree]. + */ + val byDerivesFrom: Map> by lazy { groupBy {it.derivesFrom()} } + + /** + * A map that groups features in this [FeatureTree] by their note. Order within the + * lists preserves the order in the iterator of this [FeatureTree]. + */ + val byNote: Map, List> by lazy { groupBy {it.note()} } + + /** + * A map that groups features in this [FeatureTree] by their Dbxref attribute. Order within the + * lists preserves the order in the iterator of this [FeatureTree]. + */ + val byDbxref: Map, List> by lazy { groupBy {it.dbxref()} } + + /** + * A map that groups features in this [FeatureTree] by their Ontology_term attribute. Order within the + * lists preserves the order in the iterator of this [FeatureTree]. + */ + val byOntologyTerm: Map, List> by lazy { groupBy {it.ontologyTerm()} } + + /** + * A map that groups features in this [FeatureTree] by their Is_circular attribute. Order within the + * lists preserves the order in the iterator of this [FeatureTree]. + */ + val byIsCircular: Map> by lazy { groupBy {it.isCircular()} } + + /** + * A map that groups features in this [FeatureTree] by their Gap attribute. Order within the + * lists preserves the order in the iterator of this [FeatureTree]. + */ + val byGap: Map> by lazy { groupBy {it.gap()} } + + val size = toList().size + + val list: List by lazy { + val list = mutableListOf() + if (this is Feature) list.add(this) + + for (child in children) { + list.addAll(child.toList()) + } + list + } + + /** + * Represents this FeatureTree as a GFF. + */ + override fun toString(): String { + return recursiveToString() + } + + /** + * Traverses this [FeatureTree] in depth-first order and appends to the [toString] of each [Feature]. + */ + fun recursiveToString(): String { + val sb = StringBuilder() + for (feature in this) { + sb.append(feature) + } + return sb.toString() + } + + /** + * Converts the [FeatureTree] representation to a list in depth-first order. Elements that are not instances of + * [Feature] are ignored. + */ + fun toList(): List { + return list + } + + /** + * Groups elements of the original array by the key returned by the given [keySelector] function applied to each + * element and returns a map where each group key is associated with a list of corresponding elements. + * The returned map preserves the entry iteration order of the keys produced from the original [FeatureTree]. + * + * TODO code samples + * @param mapping The function that produces keys for a given feature + */ + fun groupBy(keySelector: (Feature) -> T?): Map> { + return groupBy(keySelector) { it } + } + + /** + * Groups values returned by the valueTransform function applied to each element of the original collection by the + * key returned by the given [keySelector] function applied to the element and returns a map where each group key is + * associated with a list of corresponding values. + * + * The returned map preserves the entry iteration order of the keys produced from the original [FeatureTree]. + * + * TODO code samples + */ + fun groupBy(keySelector: (Feature) -> T?, valueTransform: (Feature) -> V): Map> { + val map = mutableMapOf>() + for (feature in this) { + val key = keySelector(feature) ?: continue + val value = valueTransform(feature) + if (map.contains(key)) map[key]?.add(value) + else map[key] = mutableListOf(value) + } + return map + } + + /** + * Returns a Map containing key-value pairs provided by transform function applied to the [Feature]s of this + * [FeatureTree]. + * If any of two pairs would have the same key the last one gets added to the map. + * If [transform] returns null, no entry is added for that feature. + * The returned map preserves the entry iteration order of the original [FeatureTree]. + */ + fun associate(transform: (Feature) -> Pair?): Map { + val map = mutableMapOf() + for (feature in this) { + val pair = transform(feature) ?: continue + map[pair.first] = pair.second + } + return map + } + + /** + * Returns a Map containing the [Feature]s from this [FeatureTree] indexed by the key returned from [keySelector] + * function applied to each element. + * If any two elements would have the same key returned by keySelector the last one gets added to the map. + * + * If [keySelector] returns null, no entry is added for that feature. + * + * The returned map preserves the entry iteration order of the original [FeatureTree]. + */ + fun associateBy(keySelector: (Feature) -> T?): Map { + return associateBy(keySelector) { it } + } + + /** + * Returns a Map containing the values provided by [valueTransform] and indexed by keySelector functions applied to + * elements of the given collection. If any two elements would have the same key returned by keySelector the last + * one gets added to the map. + * + * If [keySelector] returns null, no entry is added for that feature. + * + * The returned map preserves the entry iteration order of the original [FeatureTree]. + */ + fun associateBy(keySelector: (Feature) -> T?, valueTransform: (Feature) -> V): Map { + return associate { + val key = keySelector(it) + if (key == null) null + else Pair(key, valueTransform(it)) + } + } + + /** + * Returns a Map where keys are elements from the given array and values are produced by the valueSelector + * function applied to each element. + * If any two elements are equal, the last one gets added to the map. + * The returned map preserves the entry iteration order of the original [FeatureTree]. + */ + fun associateWith(valueSelector: (Feature) -> T): Map { + return associate { Pair(it, valueSelector(it)) } + } + + /** + * @return True if the tree rooted at this [FeatureTree] contains [feature]. False otherwise. + */ + fun contains(feature: Feature): Boolean { + return toList().contains(feature) + } + + /** + * @return True if the tree rooted at this [FeatureTree] contains all elements in [elements]. False otherwise. + */ + fun containsAll(elements: Collection): Boolean { + return toList().containsAll(elements) + } + + /** + * @return True if the tree rooted at this [FeatureTree] contains no elements that are instances of [Feature]. + * False otherwise. + */ + fun isEmpty(): Boolean { + return toList().isEmpty() + } + + /** + * @return True if all elements in the tree rooted at this [FeatureTree] match the [predicate]. False otherwise. + */ + fun all(predicate: (Feature) -> Boolean): Boolean { + return toList().all(predicate) + } + + /** + * @return True if at least one element in the tree rooted at this [FeatureTree] matches the [predicate]. False + * otherwise. + */ + fun any(predicate: (Feature) -> Boolean): Boolean { + return toList().any(predicate) + } + + /** + * @return The number of elements in the tree rooted at this [FeatureTree] matching the [predicate] + */ + fun count(predicate: (Feature) -> Boolean): Int { + return toList().count(predicate) + } + + /** + * @return A list containing only elements of the tree rooted at this [FeatureTree] that match the given [predicate]. + * The order of the iterator of the original [FeatureTree] is preserved. + */ + fun filteredList(predicate: (Feature) -> Boolean): List { + return toList().filter(predicate) + } + + /** + * @return The first element in the tree rooted at this [FeatureTree] matching the given predicate, or null if no + * such element was found. + */ + fun find(predicate: (Feature) -> Boolean): Feature? { + return toList().find(predicate) + } + + /** + * @return The first element in the tree rooted at this [FeatureTree] matching the given predicate, or null if no + * such element was found. + */ + fun findLast(predicate: (Feature) -> Boolean): Feature? { + return toList().findLast(predicate) + } + + /** + * Performs the given operation on each element in the tree rooted at this [FeatureTree]. + */ + fun forEach(action: (Feature) -> Unit) { + toList().forEach(action) + } + + /** + * Accumulates value starting with [initial] value and applying [operation] from left to right to current accumulator + * value and each element. + * + * Returns the specified initial value if the tree rooted at [FeatureTree] is empty. + */ + fun fold(initial: R, operation: (acc: R, Feature) -> R): R { + return toList().fold(initial, operation) + } + + /** + * Returns the sum of all values produced by [selector] function applied to each element in the tree rooted at + * this [FeatureTree]. + */ + fun sumOf(selector: (Feature) -> Int): Int { + return toList().map(selector).fold(0) { acc, e -> acc + e } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are scaffolds or chromosomes + */ + fun contigs(): List { + return filteredList { it.type == FeatureType.Scaffold || it.type == FeatureType.Chromosome } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are scaffolds + */ + fun scaffolds(): List { + return filteredList { it.type == FeatureType.Scaffold } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are chromosomes + */ + fun chromosomes(): List { + return filteredList { it.type == FeatureType.Chromosome } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are genes + */ + fun genes(): List { + return filteredList { it.type == FeatureType.Gene } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are transcripts + */ + fun transcripts(): List { + return filteredList { it.type == FeatureType.mRNA } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are exons + */ + fun exons(): List { + return filteredList { it.type == FeatureType.Exon } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are introns + */ + fun introns(): List { + return filteredList { it.type == FeatureType.Intron } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are CDS + */ + fun codingSequences(): List { + return filteredList { it.type == FeatureType.Coding } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are leaders + */ + fun leaders(): List { + return filteredList { it.type == FeatureType.Leader } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are terminators + */ + fun terminators(): List { + return filteredList { it.type == FeatureType.Terminator } + } + + /** + * @return A Map containing the [Feature]s from this [FeatureTree] indexed by their value for [attribute]. + */ + fun associateByAttribute(attribute: String): Map, Feature> { + return associateBy { it.attributes[attribute] } + } + + fun groupByAttribute(attribute: String): Map, List> { + return groupBy { it.attributes[attribute] } + } + + /** + * @return a list of features with [seqid] and within the [start] and [end] range (inclusive). The order + * of the iterator for this [FeatureTree] is preserved. + */ + fun within(seqid: String, start: Int, end: Int): List { + //Efficiency of this can be improved because it can give up once the start locations + //become higher than end or when it becomes a different seqid + return filteredList { it.seqid == seqid && it.start >= start && it.end <= end } + } + + /** + * @return this [FeatureTree] as a DOT file, for visualization + */ + fun toDot(): String { + val sb = StringBuilder() + sb.append("digraph {\n") + sb.append("rank = source\n") + sb.append("ordering = out\n") + sb.append("node[shape = box style = filled colorscheme = set311]\n") + + if (this !is Feature) sb.append(nonRecursiveDot()) + + for (child in this) sb.append(child.nonRecursiveDot()) + sb.append("}") + return sb.toString() + } + + /** + * @return this [FeatureTree]'s label, as well as its relationship to its direct children and direct + * parents + */ + internal open fun nonRecursiveDot(): java.lang.StringBuilder { + val sb = StringBuilder() + sb.append("\"${hashCode()}\" [label=\"container\" color = gray]\n") + for (child in children) sb.append("${hashCode()} -> ${child.hashCode()}\n") + return sb + } + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/MakeGffForNewSpecies.kt b/src/main/kotlin/biokotlin/featureTree/MakeGffForNewSpecies.kt similarity index 99% rename from src/main/kotlin/biokotlin/gff/MakeGffForNewSpecies.kt rename to src/main/kotlin/biokotlin/featureTree/MakeGffForNewSpecies.kt index c92ce53a..22f58ea4 100644 --- a/src/main/kotlin/biokotlin/gff/MakeGffForNewSpecies.kt +++ b/src/main/kotlin/biokotlin/featureTree/MakeGffForNewSpecies.kt @@ -1,4 +1,4 @@ -package biokotlin.gff +package biokotlin.featureTree import biokotlin.util.bufferedWriter import htsjdk.samtools.CigarOperator diff --git a/src/main/kotlin/biokotlin/gff/Chromosome.kt b/src/main/kotlin/biokotlin/gff/Chromosome.kt deleted file mode 100644 index 18af4181..00000000 --- a/src/main/kotlin/biokotlin/gff/Chromosome.kt +++ /dev/null @@ -1,14 +0,0 @@ -package biokotlin.gff - -class Chromosome ( - seqid: String, - source: String, - start: Int, - end: Int, - attributes: Map = emptyMap(), - children: List = emptyList() -) : Feature(seqid, source, start, end, attributes = attributes, children = children) { - - override fun type(): FeatureType = FeatureType.Chromosome - -} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Coding.kt b/src/main/kotlin/biokotlin/gff/Coding.kt deleted file mode 100644 index 21d692c0..00000000 --- a/src/main/kotlin/biokotlin/gff/Coding.kt +++ /dev/null @@ -1,20 +0,0 @@ -package biokotlin.gff - -/** - * Also known as CDS - */ -class Coding( - seqid: String, - source: String, - start: Int, - end: Int, - score: Double = Double.NaN, - strand: String = "+", - phase: String = ".", - attributes: Map = emptyMap(), - children: List = emptyList() -) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { - - override fun type(): FeatureType = FeatureType.Coding - -} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Exon.kt b/src/main/kotlin/biokotlin/gff/Exon.kt deleted file mode 100644 index 65e9d1f4..00000000 --- a/src/main/kotlin/biokotlin/gff/Exon.kt +++ /dev/null @@ -1,29 +0,0 @@ -package biokotlin.gff - -class Exon( - seqid: String, - source: String, - start: Int, - end: Int, - score: Double = Double.NaN, - strand: String = "+", - phase: String = ".", - attributes: Map = emptyMap(), - children: List = emptyList() -) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { - - override fun type(): FeatureType = FeatureType.Exon - - fun coding(): List { - return children.filterIsInstance().toList() - } - - fun leaders(): List { - return children.filterIsInstance().toList() - } - - fun terminators(): List { - return children.filterIsInstance().toList() - } - -} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt b/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt deleted file mode 100644 index 856081da..00000000 --- a/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt +++ /dev/null @@ -1,50 +0,0 @@ -package biokotlin.gff - -//TODO add pointer to parent(s) -/** - * A mutable representation of a genetic feature that can be built into a [Feature]. - * @see Feature - */ -class FeatureBuilder( - val seqid: String, - val source: String, - val type: FeatureType, - val start: Int, - val end: Int, - val score: Double = 0.0, - val strand: String = "+", - val phase: String = ".", - var attributes: Map = emptyMap() -) { - - val children = mutableListOf() - - fun id() = attributes["ID"] - - fun add(child: FeatureBuilder) { - children.add(child) - } - - /** - * Builds this feature and its children, recursively. - * @return An immutable [Feature] with the properties of this [FeatureBuilder] and whose children are built - * versions of the this [FeatureBuilder]'s children, sorted by [FeatureComparator]. - */ - fun build(): Feature { - - val children = children.map { it.build() } - return when (type) { - FeatureType.Gene -> Gene(seqid, source, start, end, score, strand, phase, attributes, children) - FeatureType.Exon -> Exon(seqid, source, start, end, score, strand, phase, attributes, children) - FeatureType.Leader -> Leader(seqid, source, start, end, score, strand, phase, attributes, children) - FeatureType.Terminator -> Terminator(seqid, source, start, end, score, strand, phase, attributes, children) - FeatureType.Coding -> Coding(seqid, source, start, end, score, strand, phase, attributes, children) - FeatureType.mRNA -> MRNA(seqid, source, start, end, score, strand, phase, attributes, children) - FeatureType.Intron -> Intron(seqid, source, start, end) - FeatureType.Chromosome -> Chromosome(seqid, source, start, end, attributes, children) - FeatureType.Scaffold -> Scaffold(seqid, source, start, end, attributes, children) - } - - } - -} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Gene.kt b/src/main/kotlin/biokotlin/gff/Gene.kt deleted file mode 100644 index 3f945339..00000000 --- a/src/main/kotlin/biokotlin/gff/Gene.kt +++ /dev/null @@ -1,22 +0,0 @@ -package biokotlin.gff - - -class Gene( - seqid: String, - source: String, - start: Int, - end: Int, - score: Double = Double.NaN, - strand: String = "+", - phase: String = ".", - attributes: Map = emptyMap(), - children: List = emptyList() -) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { - - override fun type(): FeatureType = FeatureType.Gene - - fun mRNAs(): List { - return children.filterIsInstance().toList() - } - -} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/GffTree.kt b/src/main/kotlin/biokotlin/gff/GffTree.kt deleted file mode 100644 index e9492971..00000000 --- a/src/main/kotlin/biokotlin/gff/GffTree.kt +++ /dev/null @@ -1,165 +0,0 @@ -package biokotlin.gff - -import biokotlin.util.bufferedReader - -/** - * A tree representation of a GFF file, with parent/child relationships in the tree created from the Parent attribute - * of elements in the GFF file. - * @param gff Path to the GFF file - */ -class GffTree(val gff: String): Iterable { - /** - * The top-level features of the gff, sorted by [biokotlin.gff.FeatureComparator] - */ - val roots = parseRoots(gff) - - /** - * A map of all IDs to the feature with that ID. Features without IDs are excluded. - * TODO This shouldn't be a list - */ - val idMap: Map by lazy { - getMap ( {it.id()} ) - } - - //TODO add more map presets - - /** - * Iterates over the GffTree. Starts at the first root (as defined by the order in [biokotlin.gff.FeatureComparator]), - * performs a depth-first traversal, then moves to the next root. - */ - override fun iterator(): Iterator { - return toList().iterator() - } - - /** - * Converts the tree representation to a list. The first element is the first root (as defined by the order in - * [biokotlin.gff.FeatureComparator]), then a depth-first ordering of its children, then the next root and a - * depth-first ordering of its children, etc. - */ - fun toList(): List { - TODO("Not yet implemented") - } - - /** - * Returns a map where each feature in the [GffTree] is added to a list with the key `mapping(feature)`. - * The features in each list will be ordered such that earlier elements in [iterator] are prior to later elements - * in [iterator]. When [mapping] returns `null`, no key-value pair is added to the map. The insertion order of - * the map follows the order of [iterator]. - * - * This can be useful for accessing features by their non-standard attributes - * ```kotlin - * val gffTree = GffTree("/home/user/b73.gff3") - * //Map the features by the non-standard protein_id attribute - * val proteinIDMap = gffTree.getMap { it.attributes["protein_id"] } - * //Access all features that have "Zm00001eb000020_P004" in their "protein_id" attribute - * val features = proteinIDMap["Zm00001eb000020_P004"] - * ``` - * - * Combined with custom data classes, this method also allows for convenient access of all features - * that have a particular combination of properties. - * - * Let's say you have a GFF where exons do not have unique IDs, but they can be uniquely defined by their parent's - * ID and their rank. - * ```kotlin - * data class parentAndRank(val parent: String, val rank: String) - * val gffTree = GffTree("/home/user/myGff.gff3") - * val parentAndRankMap = gffTree.getMap { feature -> - * val parent = feature.attributes["Parent"] - * val rank = feature.attributes["rank"] - * //Do not create mappings for anything that does not have a parent or a rank - * if (parent == null || rank == null) null - * else parentAndRank(parent, rank) - * } - * //Pull out the third exon on transcript Zm00001eb442870_T001 - * val exon = parentAndRankMap[parentAndRank("Zm00001eb442870_T001", "3")] - * ``` - * - * @see idMap - * @param mapping The function that produces keys for a given feature - */ - fun getMap(mapping: (Feature) -> T?): Map> { - TODO("Not yet implemented") - } - -} - -fun main() { - val gffTree = GffTree("/home/user/myGff.gff3") - val proteinIDs = gffTree.getMap { it.attributes["protein_id"] } - val protein = proteinIDs["Zm00001eb442910_P001"] // List of CDS regions - val cds = protein?.get(0) - val transcript = cds?.parent() - val gene = transcript?.parent() - -/** - * Parses a GFF and returns a list of top-level features sorted by [biokotlin.gff.FeatureComparator] - * @param gff Path to gff file - */ -private fun parseRoots(gff: String): List { - val file = bufferedReader(gff) - val roots = MutableList(0) { FeatureBuilder("", "", FeatureType.Chromosome, 0, 0) } - val orphans = MutableList(0) { FeatureBuilder("", "", FeatureType.Chromosome, 0, 0) } - val registry = HashMap(0) //id to feature builder - - var line = file.readLine() - var lineNumber = 0 - while (line != null) { - if (line.startsWith("#")) { - line = file.readLine() - lineNumber++ - continue - } - - val split = line.split("\t") - val attributes = split[8].split(";") - val attributeMap = HashMap(0) - attributes.forEach { - val splitAttribute = it.split("=") - attributeMap[splitAttribute[0]] = splitAttribute[1] - } - val score = if (split[5] == ".") { - Double.NaN - } else { - split[5].toDouble() - } - val featureBuilder = FeatureBuilder( - split[0], - split[1], - FeatureType.fromGffString(split[2]), - split[3].toInt(), - split[4].toInt(), - score, - split[6], - split[7], - attributeMap - ) - - if (featureBuilder.attributes["ID"] != null) { - registry[featureBuilder.attributes["ID"]!!] = featureBuilder - } - - if (featureBuilder.attributes["Parent"] != null) { - if (registry.contains(featureBuilder.attributes["Parent"])) { - registry[featureBuilder.attributes["Parent"]]?.add(featureBuilder) - } else { - orphans.add(featureBuilder) - } - } else { - roots.add(featureBuilder) - } - - line = file.readLine() - lineNumber++ - } - - for (orphan in orphans) { - if (registry.contains(orphan.attributes["Parent"])) { - registry[orphan.attributes["Parent"]]?.add(orphan) - } else { - roots.add(orphan) - println("Warning: Orphaned element. Parent ${orphan.attributes["Parent"]} is not in the file") - } - } - - return roots.map { it.build() }.sortedWith(FeatureComparator()) -} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Intron.kt b/src/main/kotlin/biokotlin/gff/Intron.kt deleted file mode 100644 index d11a429a..00000000 --- a/src/main/kotlin/biokotlin/gff/Intron.kt +++ /dev/null @@ -1,12 +0,0 @@ -package biokotlin.gff - -class Intron( - seqid: String, - source: String, - start: Int, - end: Int -) : Feature(seqid, source, start, end) { - - override fun type(): FeatureType = FeatureType.Intron - -} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Leader.kt b/src/main/kotlin/biokotlin/gff/Leader.kt deleted file mode 100644 index c2518a5c..00000000 --- a/src/main/kotlin/biokotlin/gff/Leader.kt +++ /dev/null @@ -1,20 +0,0 @@ -package biokotlin.gff - -/** - * Also known as 5' UTR - */ -class Leader( - seqid: String, - source: String, - start: Int, - end: Int, - score: Double = Double.NaN, - strand: String = "+", - phase: String = ".", - attributes: Map = emptyMap(), - children: List = emptyList() -) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { - - override fun type(): FeatureType = FeatureType.Leader - -} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/MRNA.kt b/src/main/kotlin/biokotlin/gff/MRNA.kt deleted file mode 100644 index 4960c755..00000000 --- a/src/main/kotlin/biokotlin/gff/MRNA.kt +++ /dev/null @@ -1,64 +0,0 @@ -package biokotlin.gff - -/** - * Also known as a Transcript - */ -class MRNA( - seqid: String, - source: String, - start: Int, - end: Int, - score: Double = Double.NaN, - strand: String = "+", - phase: String = ".", - attributes: Map = emptyMap(), - children: List = emptyList() -) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { - - override fun type(): FeatureType = FeatureType.mRNA - - /** - * @return A list of direct children that are an instance of [Coding], in order as defined by [FeatureComparator] - */ - fun coding(): List { - return children.filterIsInstance().toList() - } - - /** - * @return A list of direct children that are an instance of [Exon], in order as defined by [FeatureComparator] - */ - fun exons(): List { - return children.filterIsInstance().toList() - } - - /** - * @return A list of direct children that are an instance of [Leader], in order as defined by [FeatureComparator] - */ - fun leaders(): List { - return children.filterIsInstance().toList() - } - - /** - * @return A list of direct children that are an instance of [Terminator], in order as defined by [FeatureComparator] - */ - fun terminators(): List { - return children.filterIsInstance().toList() - } - - /** - * @return Introns in direct children - */ - fun introns(): List { - return children - .chunked(2) - .filter { it.size == 2 } - .map { - val seqid = it[0].seqid - val source = it[0].source - val start = it[0].end - val end = it[1].start - Intron(seqid, source, start, end) - } - } - -} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Scaffold.kt b/src/main/kotlin/biokotlin/gff/Scaffold.kt deleted file mode 100644 index 476f5369..00000000 --- a/src/main/kotlin/biokotlin/gff/Scaffold.kt +++ /dev/null @@ -1,14 +0,0 @@ -package biokotlin.gff - -class Scaffold( - seqid: String, - source: String, - start: Int, - end: Int, - attributes: Map = emptyMap(), - children: List = emptyList() -) : Feature(seqid, source, start, end, attributes = attributes, children = children) { - - override fun type(): FeatureType = FeatureType.Scaffold - -} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Terminator.kt b/src/main/kotlin/biokotlin/gff/Terminator.kt deleted file mode 100644 index 9b6e5a12..00000000 --- a/src/main/kotlin/biokotlin/gff/Terminator.kt +++ /dev/null @@ -1,20 +0,0 @@ -package biokotlin.gff - -/** - * Also known as 3' UTR - */ -class Terminator( - seqid: String, - source: String, - start: Int, - end: Int, - score: Double = Double.NaN, - strand: String = "+", - phase: String = ".", - attributes: Map = emptyMap(), - children: List = emptyList() -) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { - - override fun type(): FeatureType = FeatureType.Terminator - -} \ No newline at end of file From 151ff64610d260a48e199d83c00050cc0304e823 Mon Sep 17 00:00:00 2001 From: Jeffrey Morse Date: Fri, 24 Jun 2022 15:04:25 -0400 Subject: [PATCH 19/19] Changes prior to meeting with Ed, where we decided to go a different direction --- .../kotlin/biokotlin/featureTree/Feature.kt | 73 ++++++++----------- .../biokotlin/featureTree/FeatureBuilder.kt | 27 ++++++- .../biokotlin/featureTree/FeatureTree.kt | 55 +++++++------- 3 files changed, 85 insertions(+), 70 deletions(-) diff --git a/src/main/kotlin/biokotlin/featureTree/Feature.kt b/src/main/kotlin/biokotlin/featureTree/Feature.kt index 46398665..cbe1a5e2 100644 --- a/src/main/kotlin/biokotlin/featureTree/Feature.kt +++ b/src/main/kotlin/biokotlin/featureTree/Feature.kt @@ -1,35 +1,33 @@ package biokotlin.featureTree -import kotlin.jvm.Throws - /** * @see Feature */ enum class FeatureType { - Gene, - Exon, + GENE, + EXON, /** * AKA 5' UTR */ - Leader, + LEADER, /** * AKA 3' UTR */ - Terminator, + TERMINATOR, /** - * AKA CDS + * AKA CoDing Sequence */ - Coding, + CDS, /** - * AKA transcript + * AKA mRNA */ - mRNA, - Intron, - Chromosome, - Scaffold; + TRANSCRIPT, + INTRON, + CHROMOSOME, + SCAFFOLD; companion object { /** @@ -37,15 +35,15 @@ enum class FeatureType { */ fun fromGffString(gffString: String): FeatureType { return when (gffString.lowercase()) { - "gene" -> Gene - "exon" -> Exon - "five_prime_utr" -> Leader - "three_prime_utr" -> Terminator - "cds" -> Coding - "mrna" -> mRNA - "intron" -> Intron - "chromosome" -> Chromosome - "scaffold" -> Scaffold + "gene" -> GENE + "exon" -> EXON + "five_prime_utr" -> LEADER + "three_prime_utr" -> TERMINATOR + "cds" -> CDS + "mrna" -> TRANSCRIPT + "intron" -> INTRON + "chromosome" -> CHROMOSOME + "scaffold" -> SCAFFOLD else -> throw Exception("Feature $gffString is not supported") } } @@ -218,7 +216,6 @@ class Feature( val ancestors = mutableListOf() ancestors.add(this) for (parent in parents()) { - println(parent) ancestors.addAll(parent.ancestors()) } return ancestors @@ -228,55 +225,55 @@ class Feature( * @return the first exon in this [Feature]'s ancestors, or null if there are none. */ fun exon(): Feature? { - return ancestors().find { it.type == FeatureType.Exon } + return ancestors().find { it.type == FeatureType.EXON } } /** * @return the first gene in this [Feature]'s ancestors, or null if there are none. */ fun gene(): Feature? { - return ancestors().find { it.type == FeatureType.Gene } + return ancestors().find { it.type == FeatureType.GENE } } /** * @return the first transcript in this [Feature]'s ancestors, or null if there are none. */ fun transcript(): Feature? { - return ancestors().find { it.type == FeatureType.mRNA } + return ancestors().find { it.type == FeatureType.TRANSCRIPT } } /** * @return the first scaffold in this [Feature]'s ancestors, or null if there are none. */ fun scaffold(): Feature? { - return ancestors().find { it.type == FeatureType.Scaffold } + return ancestors().find { it.type == FeatureType.SCAFFOLD } } /** * @return the first chromosome in this [Feature]'s ancestors, or null if there are none. */ fun chromosome(): Feature? { - return ancestors().find { it.type == FeatureType.Chromosome } + return ancestors().find { it.type == FeatureType.CHROMOSOME } } /** * @return the first contig in this [Feature]'s ancestors, or null if there are none. */ fun contig(): Feature? { - return ancestors().find { it.type == FeatureType.Scaffold || it.type == FeatureType.Chromosome } + return ancestors().find { it.type == FeatureType.SCAFFOLD || it.type == FeatureType.CHROMOSOME } } override fun nonRecursiveDot(): java.lang.StringBuilder { val sb = StringBuilder() - if (id() != null) sb.append("\"${hashCode()}\" [label=\"${id()}\\n$type $start-$end\" color = ${typeToColor(type)}]\n") - else sb.append("\"${hashCode()}\" [label=\"$type $start-$end\" color = ${typeToColor(type)}]\n") + if (id() != null) sb.append("\"${hashCode()}\" [label=\"${id()}\\n$type\\n$start-$end\" color = ${typeToColor(type)}]\n") + else sb.append("\"${hashCode()}\" [label=\"$type\\n$start-$end\" color = ${typeToColor(type)}]\n") for (child in children) sb.append("${hashCode()} -> ${child.hashCode()}\n") - for (parent in parents) sb.append("${hashCode()} -> ${parent.hashCode()}\n") + for (parent in parents) sb.append("${hashCode()} -> ${parent.hashCode()} [color = steelblue]\n") return sb } private fun typeToColor(type: FeatureType): Int { - return type.ordinal % 11 + 1 + return type.ordinal % 12 + 1 } } @@ -294,13 +291,7 @@ class FeatureComparator: Comparator { } } -fun invariantBroken(culprit: Any, number: Int) { - println("=========================================================================") - println("Invariant $number broken for class ${culprit::class}. culprit.toString():") - println(culprit.toString()) -} - fun main() { - val tree = FeatureTree.fromGff("/home/jeff/Buckler/Biokotlin/biokotlin/src/test/resources/biokotlin/featureTree/b73_shortened.gff") - println(tree.toDot()) + val tree = FeatureTree.fromGff("/home/jeff/Buckler/Biokotlin/b73.gff") + println(tree.genes()[0].toDot()) } \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/featureTree/FeatureBuilder.kt b/src/main/kotlin/biokotlin/featureTree/FeatureBuilder.kt index e6eea51c..6c082d84 100644 --- a/src/main/kotlin/biokotlin/featureTree/FeatureBuilder.kt +++ b/src/main/kotlin/biokotlin/featureTree/FeatureBuilder.kt @@ -3,6 +3,7 @@ package biokotlin.featureTree open class FeatureTreeBuilder { protected val children = mutableListOf() protected val builtChildren = mutableListOf() + protected var built: FeatureTree? = null fun addChild(child: FeatureBuilder) { children.add(child) @@ -14,9 +15,20 @@ open class FeatureTreeBuilder { } open fun build(): FeatureTree { + if (built != null) return built!! + val built = FeatureTree(children.map { it.build() }) for (builtChild in builtChildren) builtChild.addParent(built) builtChildren.clear() + this.built = built + return built + } + + open fun rebuild(): FeatureTree { + val built = FeatureTree(children.map { it.build() }) + for (builtChild in builtChildren) builtChild.addParent(built) + builtChildren.clear() + this.built = built return built } } @@ -45,16 +57,29 @@ class FeatureBuilder( fun id() = attributes["ID"] + override fun build(): Feature { + if (built as? Feature != null) return built as Feature + + val children = children.map { it.build() } + val built = Feature(seqid, source, type, start, end, score, strand, phase, attributes, children) + for (parent in parents) parent.addBuiltChild(built) + for (builtChild in builtChildren) builtChild.addParent(built) + builtChildren.clear() + this.built = built + return built + } + /** * @return an immutable tree representation of this [Feature]. To simultaneously build multiple top-level * elements into the same tree, use [buildFromList] */ - override fun build(): Feature { + override fun rebuild(): Feature { val children = children.map { it.build() } val built = Feature(seqid, source, type, start, end, score, strand, phase, attributes, children) for (parent in parents) parent.addBuiltChild(built) for (builtChild in builtChildren) builtChild.addParent(built) builtChildren.clear() + this.built = built return built } diff --git a/src/main/kotlin/biokotlin/featureTree/FeatureTree.kt b/src/main/kotlin/biokotlin/featureTree/FeatureTree.kt index 7df27f1e..2bf1d3c6 100644 --- a/src/main/kotlin/biokotlin/featureTree/FeatureTree.kt +++ b/src/main/kotlin/biokotlin/featureTree/FeatureTree.kt @@ -1,7 +1,6 @@ package biokotlin.featureTree import biokotlin.util.bufferedReader -import htsjdk.samtools.util.SequenceUtil.a /** * The root of a tree of GFF Features. Contains useful methods for parsing down a tree of such features. @@ -32,11 +31,13 @@ open class FeatureTree (children: List): Iterable { continue } + println(line) val split = line.split("\t") val attributes = split[8].split(";") val attributeMap = HashMap>(0) - attributes.forEach { - val tagValue = it.split("=") + for (attribute in attributes) { + val tagValue = attribute.split("=") + if (tagValue.size != 2) continue val tag = tagValue[0] val values = tagValue[1].split(",") attributeMap[tag] = values.toSet() @@ -65,7 +66,7 @@ open class FeatureTree (children: List): Iterable { if (registry.contains(parent)) { registry[parent]?.addChild(featureBuilder) } else { - TODO("Please use a well-formed GFF (meaning parents occur before all their children)") + TODO("Poor ordering not yet supported. List parents before their children.") } } } else { @@ -206,17 +207,7 @@ open class FeatureTree (children: List): Iterable { */ val byGap: Map> by lazy { groupBy {it.gap()} } - val size = toList().size - - val list: List by lazy { - val list = mutableListOf() - if (this is Feature) list.add(this) - - for (child in children) { - list.addAll(child.toList()) - } - list - } + fun size() = toList().size /** * Represents this FeatureTree as a GFF. @@ -238,10 +229,18 @@ open class FeatureTree (children: List): Iterable { /** * Converts the [FeatureTree] representation to a list in depth-first order. Elements that are not instances of - * [Feature] are ignored. + * [Feature] are ignored. When an element has multiple parents, it is only listed the first time that it is + * encountered in a depth-first traversal and ignored subsequently. */ fun toList(): List { - return list + val list = mutableListOf() + if (this is Feature) list.add(this) + + for (child in children) { + list.addAll(child.toList()) + } + //.distinct() is needed in case of multiple children. Somewhat significantly worsens asymptotic complexity. + return list.distinct() } /** @@ -429,70 +428,70 @@ open class FeatureTree (children: List): Iterable { * @return A list of all elements in the tree rooted at this [FeatureTree] that are scaffolds or chromosomes */ fun contigs(): List { - return filteredList { it.type == FeatureType.Scaffold || it.type == FeatureType.Chromosome } + return filteredList { it.type == FeatureType.SCAFFOLD || it.type == FeatureType.CHROMOSOME } } /** * @return A list of all elements in the tree rooted at this [FeatureTree] that are scaffolds */ fun scaffolds(): List { - return filteredList { it.type == FeatureType.Scaffold } + return filteredList { it.type == FeatureType.SCAFFOLD } } /** * @return A list of all elements in the tree rooted at this [FeatureTree] that are chromosomes */ fun chromosomes(): List { - return filteredList { it.type == FeatureType.Chromosome } + return filteredList { it.type == FeatureType.CHROMOSOME } } /** * @return A list of all elements in the tree rooted at this [FeatureTree] that are genes */ fun genes(): List { - return filteredList { it.type == FeatureType.Gene } + return filteredList { it.type == FeatureType.GENE } } /** * @return A list of all elements in the tree rooted at this [FeatureTree] that are transcripts */ fun transcripts(): List { - return filteredList { it.type == FeatureType.mRNA } + return filteredList { it.type == FeatureType.TRANSCRIPT } } /** * @return A list of all elements in the tree rooted at this [FeatureTree] that are exons */ fun exons(): List { - return filteredList { it.type == FeatureType.Exon } + return filteredList { it.type == FeatureType.EXON } } /** * @return A list of all elements in the tree rooted at this [FeatureTree] that are introns */ fun introns(): List { - return filteredList { it.type == FeatureType.Intron } + return filteredList { it.type == FeatureType.INTRON } } /** * @return A list of all elements in the tree rooted at this [FeatureTree] that are CDS */ fun codingSequences(): List { - return filteredList { it.type == FeatureType.Coding } + return filteredList { it.type == FeatureType.CDS } } /** * @return A list of all elements in the tree rooted at this [FeatureTree] that are leaders */ fun leaders(): List { - return filteredList { it.type == FeatureType.Leader } + return filteredList { it.type == FeatureType.LEADER } } /** * @return A list of all elements in the tree rooted at this [FeatureTree] that are terminators */ fun terminators(): List { - return filteredList { it.type == FeatureType.Terminator } + return filteredList { it.type == FeatureType.TERMINATOR } } /** @@ -524,7 +523,7 @@ open class FeatureTree (children: List): Iterable { sb.append("digraph {\n") sb.append("rank = source\n") sb.append("ordering = out\n") - sb.append("node[shape = box style = filled colorscheme = set311]\n") + sb.append("node[shape = box style = filled colorscheme = set312]\n") if (this !is Feature) sb.append(nonRecursiveDot())