Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to maintain N in umi #907

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
17 changes: 13 additions & 4 deletions src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
Original file line number Diff line number Diff line change
Expand Up @@ -466,8 +466,9 @@ object Strategy extends FgBioEnum[Strategy] {
|
|By default, all UMIs must be the same length. If `--min-umi-length=len` is specified then reads that have a UMI
|shorter than `len` will be discarded, and when comparing UMIs of different lengths, the first len bases will be
|compared, where `len` is the length of the shortest UMI. The UMI length is the number of [ACGT] bases in the UMI
|(i.e. does not count dashes and other non-ACGT characters). This option is not implemented for reads with UMI pairs
|compared, where `len` is the length of the shortest UMI. The UMI length is the number of [ACGT]
|([ACTGN], if the N flag is passed) bases in the UMI
|(i.e. does not count dashes and other non-ACGT(N) characters). This option is not implemented for reads with UMI pairs
|(i.e. using the paired assigner).
|
|If the input is not template-coordinate sorted (i.e. `SO:unsorted GO:query SS:unsorted:template-coordinate`), then
Expand All @@ -482,6 +483,8 @@ class GroupReadsByUmi
@arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI",
@arg(flag='m', doc="Minimum mapping quality for mapped reads.") val minMapQ: Int = 1,
@arg(flag='n', doc="Include non-PF reads.") val includeNonPfReads: Boolean = false,
@arg(flag = 'N', doc = "Allow UMIs which contain Ns. Ns will be treated as mismatches to all other non-Ns")
val includeNs: Boolean = false,
@arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy,
@arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1,
@arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length,
Expand Down Expand Up @@ -543,6 +546,7 @@ class GroupReadsByUmi
(this.edits == 0 || this.strategy == Strategy.Identity)
}


// Filter and sort the input BAM file
logger.info("Filtering the input.")
val filteredIterator = in.iterator
Expand All @@ -551,7 +555,11 @@ class GroupReadsByUmi
.filter(r => (r.mapped || (r.paired && r.mateMapped)) || { filteredPoorAlignment += 1; false })
.filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false })
.filter(r => mapqOk(r, this.minMapQ) || { filteredPoorAlignment += 1; false })
.filter(r => !r.get[String](rawTag).exists(_.contains('N')) || { filteredNsInUmi += 1; false })
.filter(r =>
(this.includeNs && !r.get[String](rawTag).exists { umi =>
umi.forall(c => c == 'N' || c == '-')
} || !r.get[String](rawTag).exists(_.contains('N')))
|| { filteredNsInUmi += 1; false })
.filter { r =>
this.minUmiLength.forall { l =>
r.get[String](this.rawTag).forall { umi =>
Expand Down Expand Up @@ -644,7 +652,8 @@ class GroupReadsByUmi
logger.info(f"Accepted $kept%,d reads for grouping.")
if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.")
logger.info(f"Filtered out $filteredPoorAlignment%,d reads due to mapping issues.")
logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.")
if (!this.includeNs) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.")
if (this.includeNs && filteredNsInUmi > 0) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained only Ns in their UMIs.")
this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d reads that contained UMIs that were too short.") }
}

Expand Down
6 changes: 4 additions & 2 deletions src/main/scala/com/fulcrumgenomics/util/Sequences.scala
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,17 @@ object Sequences {
*/
def gcContent(s: String): Double = if (s.isEmpty) 0 else SequenceUtil.calculateGc(s.getBytes)

/** Counts the number of mismatches between two sequences of the same length. */
/** Counts the number of mismatches between two sequences of the same length.
* N's always count as mismatches
*/
def countMismatches(s1: String, s2: String): Int = {
require(s1.length == s2.length, s"Cannot count mismatches in strings of differing lengths: $s1 $s2")

var count = 0
forloop (from=0, until=s1.length) { i =>
val a = Character.toUpperCase(s1.charAt(i))
val b = Character.toUpperCase(s2.charAt(i))
if (a != b) count += 1
if (a != b || a == 'N' || b == 'N') count += 1
}

count
Expand Down
118 changes: 118 additions & 0 deletions src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,124 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
}

it should "correctly group together single-end reads with UMIs containing N's, if option is set" in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA"))
builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAN"))
builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "CACACACA"))
builder.addFrag(name = "a04", start = 100, attrs = Map("RX" -> "CACACACC"))
builder.addFrag(name = "a05", start = 105, attrs = Map("RX" -> "GTAGTAGG"))
builder.addFrag(name = "a06", start = 105, attrs = Map("RX" -> "GTAGTAGG"))
builder.addFrag(name = "a07", start = 107, attrs = Map("RX" -> "AAAAAAAA"))
builder.addFrag(name = "a08", start = 107, attrs = Map("RX" -> "AAAAAAAA"))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Edit, edits = 1).execute()

val recs = readBamRecs(out)
recs should have size 8

val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet)
groups should have size 4
groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
}

it should "correctly filter UMIs which are only N's, if option is set" in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "NNNNNNNN"))
builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAA"))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Edit, edits = 1).execute()

val recs = readBamRecs(out)
recs should have size 1

val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet)
groups should have size 1
groups should contain theSameElementsAs Seq(Set("a02"))
}

it should "exclude reads that contain an only N's in the UMI, handle - also." in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "NNN-NNN"))
builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".bam")
new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Paired, edits = 2).execute()

val recs = readBamRecs(out)
recs should have size 2

readBamRecs(out).map(_.name).distinct shouldBe Seq("a02")
}


it should "correctly filter and not group together single-end reads with UMIs containing N's" in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA"))
builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAN"))
builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "CACACACA"))
builder.addFrag(name = "a04", start = 100, attrs = Map("RX" -> "CACACACC"))
builder.addFrag(name = "a05", start = 105, attrs = Map("RX" -> "GTAGTAGG"))
builder.addFrag(name = "a06", start = 105, attrs = Map("RX" -> "GTAGTAGG"))
builder.addFrag(name = "a07", start = 107, attrs = Map("RX" -> "AAAAAAAA"))
builder.addFrag(name = "a08", start = 107, attrs = Map("RX" -> "AAAAAAAA"))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = false, strategy = Strategy.Edit, edits = 1).execute()

val recs = readBamRecs(out)
recs should have size 7

val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet)
groups should have size 4
groups should contain theSameElementsAs Seq(Set("a01"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
}


it should "include reads that contain an N in the UMI, if option is set." in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))
builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))
builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACN"))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".bam")
new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Paired, edits = 2).execute()

readBamRecs(out).map(_.name).distinct shouldBe Seq("a01", "a02", "a03")
}

it should "correctly group UMIs within the edit distance, if option is set" in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAANN"))
builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAA"))
builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "AAAAANNN"))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Edit, edits = 2).execute()

val recs = readBamRecs(out)
recs should have size 3

val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet)
groups should have size 2
groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03"))

}



it should "exclude reads that contain an N in the UMI" in {
val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))
builder.addPair(name="a01", start1=100, start2=300, strand1=Plus, strand2=Minus, attrs=Map("RX" -> "ACT-ACT"))
Expand Down