diff --git a/src/main/kotlin/biokotlin/featureTree/Feature.kt b/src/main/kotlin/biokotlin/featureTree/Feature.kt new file mode 100644 index 00000000..cbe1a5e2 --- /dev/null +++ b/src/main/kotlin/biokotlin/featureTree/Feature.kt @@ -0,0 +1,297 @@ +package biokotlin.featureTree + +/** + * @see Feature + */ +enum class FeatureType { + GENE, + EXON, + /** + * AKA 5' UTR + */ + LEADER, + + /** + * AKA 3' UTR + */ + TERMINATOR, + + /** + * AKA CoDing Sequence + */ + CDS, + + /** + * AKA mRNA + */ + TRANSCRIPT, + INTRON, + CHROMOSOME, + SCAFFOLD; + + companion object { + /** + * Converts from standard names in GFF files to a FeatureType. Case-insensitive + */ + fun fromGffString(gffString: String): FeatureType { + return when (gffString.lowercase()) { + "gene" -> GENE + "exon" -> EXON + "five_prime_utr" -> LEADER + "three_prime_utr" -> TERMINATOR + "cds" -> CDS + "mrna" -> TRANSCRIPT + "intron" -> INTRON + "chromosome" -> CHROMOSOME + "scaffold" -> SCAFFOLD + else -> throw Exception("Feature $gffString is not supported") + } + } + } +} + +/** + * @param seqid The ID of the landmark used to establish the coordinate system for the current feature. IDs may contain + * any characters, but must escape any characters not in the set [a-zA-Z0-9.:^*$@!+_?-|]. In particular, IDs may not + * contain unescaped whitespace and must not begin with an unescaped ">". + * @param source The source is a free text qualifier intended to describe the algorithm or operating procedure that + * generated this feature. Typically this is the name of a piece of software, such as "Genescan" or a database name, + * such as "Genbank." In effect, the source is used to extend the feature ontology by adding a qualifier to the type + * creating a new composite type that is a subclass of the type in the type column. + * @param type TODO + * @param start The start and end coordinates of the feature are given in positive 1-based integer coordinates, relative + * to the landmark given in column one. Start is always less than or equal to end. For features that cross the origin + * of a circular feature (e.g. most bacterial genomes, plasmids, and some viral genomes), the requirement for start to + * be less than or equal to end is satisfied by making end = the position of the end + the length of the landmark feature. + * @param end The start and end coordinates of the feature are given in positive 1-based integer coordinates, relative + * to the landmark given in column one. Start is always less than or equal to end. For features that cross the origin + * of a circular feature (e.g. most bacterial genomes, plasmids, and some viral genomes), the requirement for start to + * be less than or equal to end is satisfied by making end = the position of the end + the length of the landmark feature. + * @param score The score of the feature, a floating point number. As in earlier versions of the format, the semantics + * of the score are ill-defined. It is strongly recommended that E-values be used for sequence similarity features, + * and that P-values be used for ab initio gene prediction features. Represented by Double.NaN when no score exists + * for a particular feature. + * @param strand The strand of the feature. + for positive strand (relative to the landmark), - for minus strand, + * and . for features that are not stranded. In addition, ? can be used for features whose strandedness is relevant, + * but unknown. + * @param phase For features of type "CDS", the phase indicates where the next codon begins relative to the 5' end + * (where the 5' end of the CDS is relative to the strand of the CDS feature) of the current CDS feature. For + * clarification the 5' end for CDS features on the plus strand is the feature's start and and the 5' end for CDS + * features on the minus strand is the feature's end. The phase is one of the integers 0, 1, or 2, indicating the + * number of bases forward from the start of the current CDS feature the next codon begins. A phase of "0" indicates + * that a codon begins on the first nucleotide of the CDS feature (i.e. 0 bases forward), a phase of "1" indicates + * that the codon begins at the second nucleotide of this CDS feature and a phase of "2" indicates that the codon + * begins at the third nucleotide of this region. Note that ‘Phase’ in the context of a GFF3 CDS feature should not + * be confused with the similar concept of frame that is also a common concept in bioinformatics. Frame is generally + * calculated as a value for a given base relative to the start of the complete open reading frame (ORF) or the + * codon (e.g. modulo 3) while CDS phase describes the start of the next codon relative to a given CDS feature. + * @param attributes A list of feature attributes in the format tag=value. Multiple tag=value pairs are separated + * by semicolons. URL escaping rules are used for tags or values containing the following characters: ",=;". Spaces + * are allowed in this field, but tabs must be replaced with the %09 URL escape. Attribute values do not need to be + * and should not be quoted. The quotes should be included as part of the value by parsers and not stripped. When there + * are multiple values for the same tag, they are separated by a comma. + * @param children TODO + * @see FeatureBuilder + */ +class Feature( + val seqid: String, + val source: String, + val type: FeatureType, + val start: Int, + val end: Int, + val score: Double = Double.NaN, + val strand: String = "+", + val phase: String = ".", + val attributes: Map> = emptyMap(), + children: List = emptyList(), +): FeatureTree(children) { + + private val parents = mutableListOf() + + /** + * Adds a parent. ONLY FOR USE IN BUILDERS! + */ + internal fun addParent(parent: FeatureTree) { + parents.add(parent) + } + + /** + * @return one of the parents of this [Feature] that is an instance of [Feature] or null if no such parent exists. + * This function is to provide convenience for features that are known to only have one parent. To prevent excessive + * casts, this function cannot access a top-level [FeatureTree] parent that is not an instance of [Feature]. To + * access this top level container, use [root]. + * @see root + * @see parents + */ + fun parent(): Feature? { + return parents.find{ it is Feature } as? Feature + } + + /** + * @return The parents of this [Feature] that are instances of [Feature] or an empty list if no such parents + * exist. To access a top-level [FeatureTree] that is not a [Feature], use [root]. + * @see root + * @see parent + */ + fun parents(): List { + return parents.filterIsInstance().sortedWith(FeatureComparator()) + } + + /** + * @return the top-level container that is an ancestor of this [Feature]. + */ + fun root(): FeatureTree { + for (feature in ancestors()) { + if (feature.parents.isEmpty()) return this + if (feature.parents[0] !is Feature) return feature.parents[0] + } + throw Exception("Malformed FeatureTree. Could not find root.") + } + + //Some attributes cannot have multiple values, so the .first() calls allows for more convenience + + fun id() = attributes["ID"]?.first() + + fun name() = attributes["Name"]?.first() + + fun alias() = attributes["Alias"] + + fun target() = attributes["Target"]?.first() + + fun gap() = attributes["Gap"]?.first() + + fun derivesFrom() = attributes["Derives_from"]?.first() + + fun note() = attributes["Note"] + + fun dbxref() = attributes["Dbxref"] + + fun ontologyTerm() = attributes["Ontology_term"] + + fun isCircular() = attributes["Is_circular"]?.first() + + /** + * Compares this to [other] alphabetically by seqid, then by start, then by end position. + * Returns zero if this and [other] are equal in ordering, a negative number if this is less + * than [other], or a positive number if this is greater than [other]. + */ + fun compareTo(other: Feature): Int { + return if (seqid.compareTo(other.seqid) == 0) { + if (start.compareTo(other.start) == 0) { + end.compareTo(other.end) + } else { + start.compareTo(other.start) + } + } else { + seqid.compareTo(other.seqid) + } + } + + /** + * @return The feature as a string representing row in a GFF file. + */ + override fun toString(): String { + val scoreString = if (score.isNaN()) { + "." + } else { + score.toString() + } + + val attributesString = StringBuilder() + for ((tag, set) in attributes) { + attributesString.append(tag).append("=") + for (value in set) { + attributesString.append(value).append(",") + } + } + //TODO naively doing type will not line up properly with the gff name + return "$seqid\t$source\t${type}\t$start\t$end\t$scoreString\t$strand\t$phase\t${attributesString}\n" + } + + /** + * @return a list containing this [Feature] and ancestors of this feature. The ordering is depth-first, + * left-to-right. Ancestors that are not instances of [Feature] are ignored. + */ + fun ancestors(): List { + val ancestors = mutableListOf() + ancestors.add(this) + for (parent in parents()) { + ancestors.addAll(parent.ancestors()) + } + return ancestors + } + + /** + * @return the first exon in this [Feature]'s ancestors, or null if there are none. + */ + fun exon(): Feature? { + return ancestors().find { it.type == FeatureType.EXON } + } + + /** + * @return the first gene in this [Feature]'s ancestors, or null if there are none. + */ + fun gene(): Feature? { + return ancestors().find { it.type == FeatureType.GENE } + } + + /** + * @return the first transcript in this [Feature]'s ancestors, or null if there are none. + */ + fun transcript(): Feature? { + return ancestors().find { it.type == FeatureType.TRANSCRIPT } + } + + /** + * @return the first scaffold in this [Feature]'s ancestors, or null if there are none. + */ + fun scaffold(): Feature? { + return ancestors().find { it.type == FeatureType.SCAFFOLD } + } + + /** + * @return the first chromosome in this [Feature]'s ancestors, or null if there are none. + */ + fun chromosome(): Feature? { + return ancestors().find { it.type == FeatureType.CHROMOSOME } + } + + /** + * @return the first contig in this [Feature]'s ancestors, or null if there are none. + */ + fun contig(): Feature? { + return ancestors().find { it.type == FeatureType.SCAFFOLD || it.type == FeatureType.CHROMOSOME } + } + + override fun nonRecursiveDot(): java.lang.StringBuilder { + val sb = StringBuilder() + if (id() != null) sb.append("\"${hashCode()}\" [label=\"${id()}\\n$type\\n$start-$end\" color = ${typeToColor(type)}]\n") + else sb.append("\"${hashCode()}\" [label=\"$type\\n$start-$end\" color = ${typeToColor(type)}]\n") + for (child in children) sb.append("${hashCode()} -> ${child.hashCode()}\n") + for (parent in parents) sb.append("${hashCode()} -> ${parent.hashCode()} [color = steelblue]\n") + return sb + } + + private fun typeToColor(type: FeatureType): Int { + return type.ordinal % 12 + 1 + } + +} + +/** + * Provides ordering for feature + */ +class FeatureComparator: Comparator { + /** + * Returns the same result as [p0].compareTo([p1]) unless one of the arguments is null, in which case it returns 0. + */ + override fun compare(p0: Feature?, p1: Feature?): Int { + if (p0 == null || p1 == null) return 0 + return p0.compareTo(p1) + } +} + +fun main() { + val tree = FeatureTree.fromGff("/home/jeff/Buckler/Biokotlin/b73.gff") + println(tree.genes()[0].toDot()) +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/featureTree/FeatureBuilder.kt b/src/main/kotlin/biokotlin/featureTree/FeatureBuilder.kt new file mode 100644 index 00000000..6c082d84 --- /dev/null +++ b/src/main/kotlin/biokotlin/featureTree/FeatureBuilder.kt @@ -0,0 +1,102 @@ +package biokotlin.featureTree + +open class FeatureTreeBuilder { + protected val children = mutableListOf() + protected val builtChildren = mutableListOf() + protected var built: FeatureTree? = null + + fun addChild(child: FeatureBuilder) { + children.add(child) + child.addParent(this) + } + + internal fun addBuiltChild(child: Feature) { + builtChildren.add(child) + } + + open fun build(): FeatureTree { + if (built != null) return built!! + + val built = FeatureTree(children.map { it.build() }) + for (builtChild in builtChildren) builtChild.addParent(built) + builtChildren.clear() + this.built = built + return built + } + + open fun rebuild(): FeatureTree { + val built = FeatureTree(children.map { it.build() }) + for (builtChild in builtChildren) builtChild.addParent(built) + builtChildren.clear() + this.built = built + return built + } +} + +/** + * A mutable representation of a genetic feature that can be built into a [Feature]. + * @see Feature + */ +class FeatureBuilder( + val seqid: String, + val source: String, + val type: FeatureType, + val start: Int, + val end: Int, + val score: Double = 0.0, + val strand: String = "+", + val phase: String = ".", + var attributes: Map> = emptyMap() //TODO make Set to account for multiple values in the same attribute +): FeatureTreeBuilder() { + + private val parents = mutableListOf() + + fun addParent(parent: FeatureTreeBuilder) { + parents.add(parent) + } + + fun id() = attributes["ID"] + + override fun build(): Feature { + if (built as? Feature != null) return built as Feature + + val children = children.map { it.build() } + val built = Feature(seqid, source, type, start, end, score, strand, phase, attributes, children) + for (parent in parents) parent.addBuiltChild(built) + for (builtChild in builtChildren) builtChild.addParent(built) + builtChildren.clear() + this.built = built + return built + } + + /** + * @return an immutable tree representation of this [Feature]. To simultaneously build multiple top-level + * elements into the same tree, use [buildFromList] + */ + override fun rebuild(): Feature { + val children = children.map { it.build() } + val built = Feature(seqid, source, type, start, end, score, strand, phase, attributes, children) + for (parent in parents) parent.addBuiltChild(built) + for (builtChild in builtChildren) builtChild.addParent(built) + builtChildren.clear() + this.built = built + return built + } + + companion object { + /** + * @param list the top-level elements to be built. + * @return a built version of the trees represented by [list], combined into a single tree. + */ + fun buildFromList(list: List): FeatureTree { + if (list.isEmpty()) return FeatureTree(emptyList()) + if (list.size == 1) return list[0].build() + else { + val tree = FeatureTreeBuilder() + for (child in list) tree.addChild(child) + return tree.build() + } + } + } + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/featureTree/FeatureTree.kt b/src/main/kotlin/biokotlin/featureTree/FeatureTree.kt new file mode 100644 index 00000000..2bf1d3c6 --- /dev/null +++ b/src/main/kotlin/biokotlin/featureTree/FeatureTree.kt @@ -0,0 +1,546 @@ +package biokotlin.featureTree + +import biokotlin.util.bufferedReader + +/** + * The root of a tree of GFF Features. Contains useful methods for parsing down a tree of such features. + * A [Feature] is a subclass of [FeatureTree] with additional information that describes the properties of the + * [Feature]. Note that instances of [FeatureTree] that are not instances of [Feature] must be top-level roots and are + * ___excluded___ from most operations on the tree, including the iterator. This is because these roots are simply + * artificial containers that do not hold meaningful information from the GFF file. + * @see Feature + */ +open class FeatureTree (children: List): Iterable { + val children = children.sortedWith(FeatureComparator()) + + companion object { + /** + * @param gff a pathname of a gff file. + * @return a tree representing the features of [gff] + */ + fun fromGff(gff: String): FeatureTree { + + val file = bufferedReader(gff) + val registry = HashMap(0) //id to feature builder + val roots = mutableListOf() + + var line = file.readLine() + while (line != null) { + if (line.startsWith("#")) { + line = file.readLine() + continue + } + + println(line) + val split = line.split("\t") + val attributes = split[8].split(";") + val attributeMap = HashMap>(0) + for (attribute in attributes) { + val tagValue = attribute.split("=") + if (tagValue.size != 2) continue + val tag = tagValue[0] + val values = tagValue[1].split(",") + attributeMap[tag] = values.toSet() + } + val score = split[5].toDoubleOrNull() ?: Double.NaN + + val featureBuilder = FeatureBuilder( + split[0], + split[1], + FeatureType.fromGffString(split[2]), + split[3].toInt(), + split[4].toInt(), + score, + split[6], + split[7], + attributeMap + ) + + if (featureBuilder.attributes["ID"] != null) { + registry[featureBuilder.attributes["ID"]!!.first()] = featureBuilder + } + + if (featureBuilder.attributes["Parent"] != null) { + val parents = featureBuilder.attributes["Parent"]!! + for (parent in parents) { + if (registry.contains(parent)) { + registry[parent]?.addChild(featureBuilder) + } else { + TODO("Poor ordering not yet supported. List parents before their children.") + } + } + } else { + roots.add(featureBuilder) + } + + line = file.readLine() + } + + return FeatureBuilder.buildFromList(roots) + } + } + + /** + * Iterates over the tree rooted at this [FeatureTree] in depth-first order. Elements that are not instances + * of [Feature] are ignored. + */ + override fun iterator(): Iterator { + return toList().iterator() + } + + /** + * A map of all IDs of features in this [FeatureTree] to the feature with that ID. Features without IDs are excluded. + */ + val byID: Map by lazy { associateBy { it.id() } } + + /** + * A map that groups features in this [FeatureTree] by seqid. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val bySeqid: Map> by lazy { groupBy {it.seqid} } + + /** + * A map that groups features in this [FeatureTree] by source. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val bySource: Map> by lazy { groupBy {it.source} } + + /** + * A map that groups features in this [FeatureTree] by type. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byType: Map> by lazy { groupBy {it.type} } + + /** + * A map that groups features in this [FeatureTree] by start position. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byStart: Map> by lazy { groupBy {it.start} } + + /** + * A map that groups features in this [FeatureTree] by end position. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byEnd: Map> by lazy { groupBy {it.end} } + + /** + * A map that groups features in this [FeatureTree] by score. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byScore: Map> by lazy { groupBy {it.score} } + + /** + * A map that groups features in this [FeatureTree] by strand. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byStrand: Map> by lazy { groupBy {it.strand} } + + /** + * A map that groups features in this [FeatureTree] by phase. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byPhase: Map> by lazy { groupBy {it.phase} } + + /** + * A map that groups features in this [FeatureTree] by name. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byName: Map> by lazy { groupBy {it.name()} } + + /** + * A map that groups features in this [FeatureTree] by their alias(es). Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byAlias: Map, List> by lazy { groupBy {it.alias()} } + + /** + * A map that groups features in this [FeatureTree] by their parent ID(s). Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byParentID: Map, List> by lazy { groupBy {it.attributes["Parent"]} } + + /** + * A map that groups features in this [FeatureTree] by their parent features (sorted by [FeatureComparator]). + * Order within the value lists preserves the order in the iterator of this [FeatureTree]. + */ + val byParent: Map, List> by lazy { groupBy {it.parents()} } + + /** + * A map that groups features in this [FeatureTree] by their target. Order within the lists preserves the order in the + * iterator of this [FeatureTree]. + */ + val byTarget: Map> by lazy { groupBy {it.target()} } + + /** + * A map that groups features in this [FeatureTree] by their Derives_from attribute. Order within the + * lists preserves the order in the iterator of this [FeatureTree]. + */ + val byDerivesFrom: Map> by lazy { groupBy {it.derivesFrom()} } + + /** + * A map that groups features in this [FeatureTree] by their note. Order within the + * lists preserves the order in the iterator of this [FeatureTree]. + */ + val byNote: Map, List> by lazy { groupBy {it.note()} } + + /** + * A map that groups features in this [FeatureTree] by their Dbxref attribute. Order within the + * lists preserves the order in the iterator of this [FeatureTree]. + */ + val byDbxref: Map, List> by lazy { groupBy {it.dbxref()} } + + /** + * A map that groups features in this [FeatureTree] by their Ontology_term attribute. Order within the + * lists preserves the order in the iterator of this [FeatureTree]. + */ + val byOntologyTerm: Map, List> by lazy { groupBy {it.ontologyTerm()} } + + /** + * A map that groups features in this [FeatureTree] by their Is_circular attribute. Order within the + * lists preserves the order in the iterator of this [FeatureTree]. + */ + val byIsCircular: Map> by lazy { groupBy {it.isCircular()} } + + /** + * A map that groups features in this [FeatureTree] by their Gap attribute. Order within the + * lists preserves the order in the iterator of this [FeatureTree]. + */ + val byGap: Map> by lazy { groupBy {it.gap()} } + + fun size() = toList().size + + /** + * Represents this FeatureTree as a GFF. + */ + override fun toString(): String { + return recursiveToString() + } + + /** + * Traverses this [FeatureTree] in depth-first order and appends to the [toString] of each [Feature]. + */ + fun recursiveToString(): String { + val sb = StringBuilder() + for (feature in this) { + sb.append(feature) + } + return sb.toString() + } + + /** + * Converts the [FeatureTree] representation to a list in depth-first order. Elements that are not instances of + * [Feature] are ignored. When an element has multiple parents, it is only listed the first time that it is + * encountered in a depth-first traversal and ignored subsequently. + */ + fun toList(): List { + val list = mutableListOf() + if (this is Feature) list.add(this) + + for (child in children) { + list.addAll(child.toList()) + } + //.distinct() is needed in case of multiple children. Somewhat significantly worsens asymptotic complexity. + return list.distinct() + } + + /** + * Groups elements of the original array by the key returned by the given [keySelector] function applied to each + * element and returns a map where each group key is associated with a list of corresponding elements. + * The returned map preserves the entry iteration order of the keys produced from the original [FeatureTree]. + * + * TODO code samples + * @param mapping The function that produces keys for a given feature + */ + fun groupBy(keySelector: (Feature) -> T?): Map> { + return groupBy(keySelector) { it } + } + + /** + * Groups values returned by the valueTransform function applied to each element of the original collection by the + * key returned by the given [keySelector] function applied to the element and returns a map where each group key is + * associated with a list of corresponding values. + * + * The returned map preserves the entry iteration order of the keys produced from the original [FeatureTree]. + * + * TODO code samples + */ + fun groupBy(keySelector: (Feature) -> T?, valueTransform: (Feature) -> V): Map> { + val map = mutableMapOf>() + for (feature in this) { + val key = keySelector(feature) ?: continue + val value = valueTransform(feature) + if (map.contains(key)) map[key]?.add(value) + else map[key] = mutableListOf(value) + } + return map + } + + /** + * Returns a Map containing key-value pairs provided by transform function applied to the [Feature]s of this + * [FeatureTree]. + * If any of two pairs would have the same key the last one gets added to the map. + * If [transform] returns null, no entry is added for that feature. + * The returned map preserves the entry iteration order of the original [FeatureTree]. + */ + fun associate(transform: (Feature) -> Pair?): Map { + val map = mutableMapOf() + for (feature in this) { + val pair = transform(feature) ?: continue + map[pair.first] = pair.second + } + return map + } + + /** + * Returns a Map containing the [Feature]s from this [FeatureTree] indexed by the key returned from [keySelector] + * function applied to each element. + * If any two elements would have the same key returned by keySelector the last one gets added to the map. + * + * If [keySelector] returns null, no entry is added for that feature. + * + * The returned map preserves the entry iteration order of the original [FeatureTree]. + */ + fun associateBy(keySelector: (Feature) -> T?): Map { + return associateBy(keySelector) { it } + } + + /** + * Returns a Map containing the values provided by [valueTransform] and indexed by keySelector functions applied to + * elements of the given collection. If any two elements would have the same key returned by keySelector the last + * one gets added to the map. + * + * If [keySelector] returns null, no entry is added for that feature. + * + * The returned map preserves the entry iteration order of the original [FeatureTree]. + */ + fun associateBy(keySelector: (Feature) -> T?, valueTransform: (Feature) -> V): Map { + return associate { + val key = keySelector(it) + if (key == null) null + else Pair(key, valueTransform(it)) + } + } + + /** + * Returns a Map where keys are elements from the given array and values are produced by the valueSelector + * function applied to each element. + * If any two elements are equal, the last one gets added to the map. + * The returned map preserves the entry iteration order of the original [FeatureTree]. + */ + fun associateWith(valueSelector: (Feature) -> T): Map { + return associate { Pair(it, valueSelector(it)) } + } + + /** + * @return True if the tree rooted at this [FeatureTree] contains [feature]. False otherwise. + */ + fun contains(feature: Feature): Boolean { + return toList().contains(feature) + } + + /** + * @return True if the tree rooted at this [FeatureTree] contains all elements in [elements]. False otherwise. + */ + fun containsAll(elements: Collection): Boolean { + return toList().containsAll(elements) + } + + /** + * @return True if the tree rooted at this [FeatureTree] contains no elements that are instances of [Feature]. + * False otherwise. + */ + fun isEmpty(): Boolean { + return toList().isEmpty() + } + + /** + * @return True if all elements in the tree rooted at this [FeatureTree] match the [predicate]. False otherwise. + */ + fun all(predicate: (Feature) -> Boolean): Boolean { + return toList().all(predicate) + } + + /** + * @return True if at least one element in the tree rooted at this [FeatureTree] matches the [predicate]. False + * otherwise. + */ + fun any(predicate: (Feature) -> Boolean): Boolean { + return toList().any(predicate) + } + + /** + * @return The number of elements in the tree rooted at this [FeatureTree] matching the [predicate] + */ + fun count(predicate: (Feature) -> Boolean): Int { + return toList().count(predicate) + } + + /** + * @return A list containing only elements of the tree rooted at this [FeatureTree] that match the given [predicate]. + * The order of the iterator of the original [FeatureTree] is preserved. + */ + fun filteredList(predicate: (Feature) -> Boolean): List { + return toList().filter(predicate) + } + + /** + * @return The first element in the tree rooted at this [FeatureTree] matching the given predicate, or null if no + * such element was found. + */ + fun find(predicate: (Feature) -> Boolean): Feature? { + return toList().find(predicate) + } + + /** + * @return The first element in the tree rooted at this [FeatureTree] matching the given predicate, or null if no + * such element was found. + */ + fun findLast(predicate: (Feature) -> Boolean): Feature? { + return toList().findLast(predicate) + } + + /** + * Performs the given operation on each element in the tree rooted at this [FeatureTree]. + */ + fun forEach(action: (Feature) -> Unit) { + toList().forEach(action) + } + + /** + * Accumulates value starting with [initial] value and applying [operation] from left to right to current accumulator + * value and each element. + * + * Returns the specified initial value if the tree rooted at [FeatureTree] is empty. + */ + fun fold(initial: R, operation: (acc: R, Feature) -> R): R { + return toList().fold(initial, operation) + } + + /** + * Returns the sum of all values produced by [selector] function applied to each element in the tree rooted at + * this [FeatureTree]. + */ + fun sumOf(selector: (Feature) -> Int): Int { + return toList().map(selector).fold(0) { acc, e -> acc + e } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are scaffolds or chromosomes + */ + fun contigs(): List { + return filteredList { it.type == FeatureType.SCAFFOLD || it.type == FeatureType.CHROMOSOME } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are scaffolds + */ + fun scaffolds(): List { + return filteredList { it.type == FeatureType.SCAFFOLD } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are chromosomes + */ + fun chromosomes(): List { + return filteredList { it.type == FeatureType.CHROMOSOME } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are genes + */ + fun genes(): List { + return filteredList { it.type == FeatureType.GENE } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are transcripts + */ + fun transcripts(): List { + return filteredList { it.type == FeatureType.TRANSCRIPT } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are exons + */ + fun exons(): List { + return filteredList { it.type == FeatureType.EXON } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are introns + */ + fun introns(): List { + return filteredList { it.type == FeatureType.INTRON } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are CDS + */ + fun codingSequences(): List { + return filteredList { it.type == FeatureType.CDS } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are leaders + */ + fun leaders(): List { + return filteredList { it.type == FeatureType.LEADER } + } + + /** + * @return A list of all elements in the tree rooted at this [FeatureTree] that are terminators + */ + fun terminators(): List { + return filteredList { it.type == FeatureType.TERMINATOR } + } + + /** + * @return A Map containing the [Feature]s from this [FeatureTree] indexed by their value for [attribute]. + */ + fun associateByAttribute(attribute: String): Map, Feature> { + return associateBy { it.attributes[attribute] } + } + + fun groupByAttribute(attribute: String): Map, List> { + return groupBy { it.attributes[attribute] } + } + + /** + * @return a list of features with [seqid] and within the [start] and [end] range (inclusive). The order + * of the iterator for this [FeatureTree] is preserved. + */ + fun within(seqid: String, start: Int, end: Int): List { + //Efficiency of this can be improved because it can give up once the start locations + //become higher than end or when it becomes a different seqid + return filteredList { it.seqid == seqid && it.start >= start && it.end <= end } + } + + /** + * @return this [FeatureTree] as a DOT file, for visualization + */ + fun toDot(): String { + val sb = StringBuilder() + sb.append("digraph {\n") + sb.append("rank = source\n") + sb.append("ordering = out\n") + sb.append("node[shape = box style = filled colorscheme = set312]\n") + + if (this !is Feature) sb.append(nonRecursiveDot()) + + for (child in this) sb.append(child.nonRecursiveDot()) + sb.append("}") + return sb.toString() + } + + /** + * @return this [FeatureTree]'s label, as well as its relationship to its direct children and direct + * parents + */ + internal open fun nonRecursiveDot(): java.lang.StringBuilder { + val sb = StringBuilder() + sb.append("\"${hashCode()}\" [label=\"container\" color = gray]\n") + for (child in children) sb.append("${hashCode()} -> ${child.hashCode()}\n") + return sb + } + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/featureTree/MakeGffForNewSpecies.kt b/src/main/kotlin/biokotlin/featureTree/MakeGffForNewSpecies.kt new file mode 100644 index 00000000..22f58ea4 --- /dev/null +++ b/src/main/kotlin/biokotlin/featureTree/MakeGffForNewSpecies.kt @@ -0,0 +1,437 @@ +package biokotlin.featureTree + +import biokotlin.util.bufferedWriter +import htsjdk.samtools.CigarOperator +import htsjdk.samtools.SAMRecord +import htsjdk.samtools.SamReaderFactory +import htsjdk.samtools.ValidationStringency +import java.io.File + +/** + * Feature operator classifiers + * * `U` = undefined (`S`, `H`) + * * `I` = intron (`N`) + * * `C` = coding (`M`, `D`, `I`**, `=`, `X`) <- `I` is ignored + */ +enum class FeatureOperator { + U, C, I +} + +/** + * Data class for transcript features + * * `length` = length of feature + * * `operator` = what is the feature type? (*See `FeatureOperator` for more info*) + */ +data class TranscriptFeatures( + val length: Int, + val operator: FeatureOperator, + var id: Int +) + +/** + * Data class for BED columns + * * `chrom` = contig name + * * `chromStart` = start coordinate for sequence considered + * * `chromEnd` = end coordinate for sequence considered + * * `name` = ID for range + * * `strand` = DNA strand (`+`, `-`, or `.`) + */ +data class BedFeatures( + val chrom: String, + val chromStart: Int, + val chromEnd: Int, + val name: String, + val strand: Char + ) + +/** + * Data class for GFF columns (descriptions from https://useast.ensembl.org/info/website/upload/gff3.html) + * * seqid - name of the chromosome or scaffold + * * source - name of the program that generated this feature, or the data source (database/project name) + * * type - type of feature //TODO check which types our features can be (presumably only a subset of all possible types) + * * start - start position (STARTS COUNTING AT 1) + * * end - end position (STARTS COUNTING AT 1, INCLUSIVE) + * * score - floating point value + * * strand - '+', '-', or '.' + * * phase - One of '.', '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon, + * '1' that the second base is the first base of a codon, and so on.. + * * attributes - a map of tag-value pairs. Attribute tags with special meanings found at http://gmod.org/wiki/GFF3#GFF3_Format + * * key attributes: Parent, ID, Name (can be same as ID) + */ +data class GffFeatures( + val seqid: String, + val source: String, + val type: String, + val start: Int, + val end: Int, + val score: String, + val strand: Char, + val phase: Char, + val attributes: Map, +) { + /** + * Converts this GffFeature into a GFF row + */ + fun asRow(): String { + val sb = StringBuilder() + for ((tag, value) in attributes) { + sb.append("$tag=$value;") + } + return "$seqid\t$source\t$type\t$start\t$end\t$score\t$strand\t$phase\t$sb\n" + } +} + +// ZACK FUNCTIONS ---- + +data class TranscriptBpAlignmentStats(val xOrEqArray: MutableList, + val eqArray:MutableList, + var nCounts:Int = 0, + var dCounts:Int = 0, + var numAlignments:Int = 0 ) + +enum class ExonOperator { + L, C, T +} +val exonOperatorMap = mapOf('L' to ExonOperator.L,'C' to ExonOperator.C,'T' to ExonOperator.T) +data class ExonBoundary(val length: Int, val operator: ExonOperator) + +/** + * Function to take a single exon and parse out the operator and the length of that operation. + * It needs to have a running sub list of the number characters until it sees an operator. + * Then it will convert the temp String into an Int and save out the boundary. + */ +fun convertExonStringToBoundary(exonString: String) : List { + var runningCount = StringBuilder() + val exonBoundaries = mutableListOf() + for(element in exonString) { + //Check to see if we hit an operator + if(element in exonOperatorMap.keys) { + //if so, turn the runningCount into an Int and add the boundary to the list. + val countString = runningCount.toString() + check(!countString.isEmpty()) {"Error parsing exon name. No counts for operator"} + + exonBoundaries.add(ExonBoundary(countString.toInt(), exonOperatorMap[element]!!)) + runningCount.clear() + } + else { + runningCount.append("$element") + } + } + + return exonBoundaries +} + +/** + * Function to parse out the exon boundaries from the transcript name. + * They take the form: transcript_1234:100L24C-12C-90C10T + */ +fun parseExonBoundaries(transcriptName: String) : List> { + val exonStrings = transcriptName.split(":")[1].split("-") + + return exonStrings.map { convertExonStringToBoundary(it) } +} + +/** + * Compute CDS positions (Pair) + */ +fun computeCDSPositions(exonBoundaries:List>) : Pair { + val sizesOfOperators = exonBoundaries.flatten() + .groupBy { it.operator } + .map { Pair(it.key, it.value.map { currValue -> currValue.length }.sum()) } + .toMap() + + val leaderSize = sizesOfOperators[ExonOperator.L]?:0 + + val cdsSize = sizesOfOperators[ExonOperator.C]?:0 + + return Pair(leaderSize, leaderSize + cdsSize - 1) +} + +/** + * Function that actually creates the xOrEQ array and the eqArray. It also counts the number of N and D bps + */ +fun buildTranscriptBpAlignmentStats(samRecord: SAMRecord) : TranscriptBpAlignmentStats { + val xOrEqArray = Array(samRecord.readLength) { 0 } + val eqArray = Array(samRecord.readLength) { 0 } + + var nCounts = 0 + var dCounts = 0 + + var currentBp = 0 + val cigarElements = samRecord.cigar.cigarElements + + //Loop through the cigarElements + for(element in cigarElements) { + val operator = element.operator + val count = element.length + + if(operator== CigarOperator.N) { + nCounts+=count + } + else if(operator==CigarOperator.D) { + dCounts+=count + } + //Check to see if consumes query + else if(operator.consumesReadBases()) { + //If it consumes read bases, we can walk through the length of the CIGAR operator and set the position as a 1 + for(index in 0 until count) { + if(operator.isAlignment) { + xOrEqArray[currentBp] = 1 + } + + if (operator==CigarOperator.EQ) { + eqArray[currentBp] = 1 + } + + currentBp++ + } + } + + } + + //Check to see if it was reversed during alignment. If so we need to flip our arrays. + return if(samRecord.readNegativeStrandFlag) { + TranscriptBpAlignmentStats(xOrEqArray.reversed().toMutableList(), eqArray.reversed().toMutableList(), nCounts, dCounts,1) + } + else { + TranscriptBpAlignmentStats(xOrEqArray.toMutableList(), eqArray.toMutableList(),nCounts, dCounts,1) + } + +} + +/** + * Generate a list of feature lengths from a CIGAR string + * @param featureLengths A list of `TranscriptFeature` data objects + * @param samRecord A `SAMRecord` object + * @param taxaId What reference assembly was this aligned to? + * @return A list of `BedFeatures` data objects. + */ +fun calculateRanges(featureLengths: List, samRecord: SAMRecord, taxaId: String): MutableList { + var aggLength = samRecord.alignmentStart + val contig = samRecord.referenceName + val bedFeatures = mutableListOf() + val zmTranscriptRanges = parseExonBoundaries(samRecord.readName).flatten() + val strand = when { + samRecord.readNegativeStrandFlag -> '-' + else -> '+' + } + val prefixReadName = samRecord.readName.substringBefore(":") + + for (feature in featureLengths) { + if (feature.operator == FeatureOperator.C || feature.operator == FeatureOperator.I) { + bedFeatures.add( + BedFeatures( + contig, + (aggLength) - 1, + (aggLength + feature.length - 1), + "${taxaId}_${prefixReadName}_${feature.operator}.${feature.id}", + strand + ) + ) + aggLength += feature.length + } else { + continue + } + } + return bedFeatures +} + +/** + * Generate a list of feature lengths from a CIGAR string + * @param samRecord A SAM record entry + * @param taxaId What reference assembly was this aligned to? + * @return A list of `BedFeatures` data objects. + */ +fun buildFeatureRanges(samRecord: SAMRecord, taxaId: String): MutableList { + val cigarElements = samRecord.cigar.cigarElements + val transcriptFeatures = mutableListOf() + + // Loop through CIGAR elements + var sumLength = 0 + var i = 0 + var iCount = 1 + var cCount = 1 + var uCount = 1 + for (element in cigarElements) { + val operator = element.operator + + if (operator != CigarOperator.I) { + sumLength += element.length + } + // FIX - change to last iterator [prev: element == cigarElements.last()] + if (i == cigarElements.lastIndex && (operator.isAlignment || operator == CigarOperator.D)) { + transcriptFeatures.add(TranscriptFeatures(sumLength, FeatureOperator.C, cCount)) + break + } + // TODO: 2/6/2022 - how to handle I/D elements? + when (operator) { + CigarOperator.S, CigarOperator.H -> { + transcriptFeatures.add(TranscriptFeatures(element.length, FeatureOperator.U, uCount)) + sumLength = 0 + uCount++ + } + CigarOperator.N -> { + transcriptFeatures.add(TranscriptFeatures(element.length, FeatureOperator.I, iCount)) + sumLength = 0 + iCount++ + } + else -> { + if (cigarElements[i + 1].operator == CigarOperator.N || cigarElements[i + 1].operator.isClipping) { + transcriptFeatures.add(TranscriptFeatures(sumLength, FeatureOperator.C, cCount)) + cCount++ + } + } + + } + + i++ + } + + // Check to see if it was reversed during alignment. If so, we need to reverse the ID order (e.g. 1-18 -> 18-1). + val featureLengths = when { + samRecord.readNegativeStrandFlag -> { + val idList = mutableListOf() + transcriptFeatures.forEach { idList.add(it.id) } + val revIdList = idList.reversed() + var j = 0 + transcriptFeatures.forEach { + it.id = revIdList[j] + j++ + } + transcriptFeatures.toMutableList() + } + else -> { + transcriptFeatures + } + } + + return calculateRanges(featureLengths, samRecord, taxaId) +} + +/** + * Writes a BED file based on a SAM file. Generates coordinates for + * introns (I), coding regions (C), and whole transcripts. IDs (e.g. C.1) + * are based on strand (+, -). + * @param samFile A String object referring to a SAM file + * @param outputFile A String object referring to the output BED file + * @param taxaId What reference assembly was this aligned to? + * @param minAlignmentPercent Percent match threshold to alignment region. + */ +fun writeBedFile(samFile: String, outputFile: String, taxaId: String, minAlignmentPercent: Double = 0.90) { + // Read in SAM and creater iterator object ---- + println("Processing $taxaId ...") + val timeStartRead = System.currentTimeMillis() + val samIterator = SamReaderFactory.makeDefault() + .validationStringency(ValidationStringency.SILENT) + .setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, false) + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, false) + .setOption(SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS, false) + .open(File(samFile)) + .iterator() + val timeEndRead = System.currentTimeMillis() - timeStartRead + println("Elapsed read time: $timeEndRead ms") + + // Make buffered reader and write to file ---- + val timeStart = System.currentTimeMillis() + val bw = bufferedWriter(outputFile) + while(samIterator.hasNext()) { + val currentRecord = samIterator.next() + + // Check for secondary, supplementary, or unmapped reads ---- + if (!currentRecord.isSecondaryOrSupplementary && !currentRecord.readUnmappedFlag) { + + // Check for minimum alignment % ---- + val exonBoundaries = parseExonBoundaries(currentRecord.readName) + val cdsBoundaries = computeCDSPositions(exonBoundaries) + val stats = buildTranscriptBpAlignmentStats(currentRecord) + val numMapping = stats.xOrEqArray.slice(cdsBoundaries.first .. cdsBoundaries.second).sum() + val alignmentPercentage = (numMapping.toDouble() / (cdsBoundaries.second - cdsBoundaries.first + 1)) + if (alignmentPercentage >= minAlignmentPercent) { + val bedStats = buildFeatureRanges(currentRecord, taxaId) // make BED ranges + val transId = "${currentRecord.referenceName}\t${currentRecord.alignmentStart - 1}\t${currentRecord.alignmentEnd}\t${taxaId}_${currentRecord.readName.substringBefore(":")}\t0\t${bedStats[0].strand}\n" + bw.write(transId) + bedStats.forEach { + bw.write("${it.chrom}\t${it.chromStart}\t${it.chromEnd}\t${it.name}\t0\t${it.strand}\n") + } + } + + } + } + bw.close() + + // Get time stats ---- + val timeEnd = System.currentTimeMillis() - timeStart + println("Elapsed BED creation time: $timeEnd ms") +} + +/** + * Writes a GFF file based on the SAM file. + * @param samFile A String object referring to a SAM file + * @param outputFile A String object referring to the output BED file + * @param taxaId What reference assembly was this aligned to? + * @param minQuality Percent match threshold to alignment region. + * @param maxNumber //TODO how to define this? + */ +fun writeGffFile(samFile: String, outputFile: String, taxaId: String, minQuality: Double = 0.90, maxNumber: Int = 1) { + // Read in SAM and creater iterator object ---- + println("Processing $taxaId ...") + val timeStartRead = System.currentTimeMillis() + val samIterator = SamReaderFactory.makeDefault() + .validationStringency(ValidationStringency.SILENT) + .setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, false) + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, false) + .setOption(SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS, false) + .open(File(samFile)) + .iterator() + val timeEndRead = System.currentTimeMillis() - timeStartRead + println("Elapsed read time: $timeEndRead ms") + + // Make buffered reader and write to file ---- + val timeStart = System.currentTimeMillis() + val bw = bufferedWriter(outputFile) + while(samIterator.hasNext()) { + val currentRecord = samIterator.next() + + // Check for secondary, supplementary, or unmapped reads ---- + if (!currentRecord.isSecondaryOrSupplementary && !currentRecord.readUnmappedFlag) { + + // Check for minimum alignment % ---- + val exonBoundaries = parseExonBoundaries(currentRecord.readName) + val cdsBoundaries = computeCDSPositions(exonBoundaries) + val stats = buildTranscriptBpAlignmentStats(currentRecord) + val numMapping = stats.xOrEqArray.slice(cdsBoundaries.first .. cdsBoundaries.second).sum() + val alignmentPercentage = (numMapping.toDouble() / (cdsBoundaries.second - cdsBoundaries.first + 1)) + if (alignmentPercentage >= minQuality) { + val bedStats = buildFeatureRanges(currentRecord, taxaId) // make BED ranges //TODO make these GFF + + val topID = "${taxaId}_${currentRecord.readName.substringBefore(":")}"; + //TODO does this parent represent mRNA? + //TODO fill in the values + val mRNA = GffFeatures( + currentRecord.referenceName, + "SOURCE", + "mRNA", + currentRecord.alignmentStart, + currentRecord.alignmentEnd, + ".", + bedStats[0].strand, + '.', + mapOf("ID" to topID, "Name" to topID) + ) + bw.write(mRNA.asRow()) + //TODO key line to change + bedStats.forEach { + bw.write("${it.chrom}\t${it.chromStart}\t${it.chromEnd}\t${it.name}\t0\t${it.strand}\n") + } + } + + } + } + bw.close() + + // Get time stats ---- + val timeEnd = System.currentTimeMillis() - timeStart + println("Elapsed BED creation time: $timeEnd ms") +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/util/FileIO.kt b/src/main/kotlin/biokotlin/util/FileIO.kt index b4a92097..7eba853d 100644 --- a/src/main/kotlin/biokotlin/util/FileIO.kt +++ b/src/main/kotlin/biokotlin/util/FileIO.kt @@ -2,10 +2,10 @@ package biokotlin.util -import java.io.BufferedReader -import java.io.File +import java.io.* import java.net.URL import java.util.zip.GZIPInputStream +import java.util.zip.GZIPOutputStream fun bufferedReader(filename: String): BufferedReader { return try { @@ -21,6 +21,18 @@ fun bufferedReader(filename: String): BufferedReader { File(filename).bufferedReader() } } catch (e: Exception) { - throw IllegalStateException("FileIO: getBufferedReader: problem getting reader: " + e.message) + throw IllegalStateException("FileIO: bufferedReader: problem getting reader: " + e.message) + } +} + +fun bufferedWriter(filename: String, append: Boolean = false): BufferedWriter { + return try { + if (filename.endsWith(".gz")) { + BufferedWriter(OutputStreamWriter(GZIPOutputStream(FileOutputStream(filename, append)))) + } else { + BufferedWriter(OutputStreamWriter(FileOutputStream(filename, append))) + } + } catch (e: Exception) { + throw IllegalStateException("FileIO: bufferedWriter: problem getting writer: " + e.message) } } \ No newline at end of file