From 10f5fe4f64d90c216e1a8197fc6216c685abf4cd Mon Sep 17 00:00:00 2001 From: Joe Vieira Date: Tue, 28 Mar 2023 19:01:07 -0400 Subject: [PATCH 01/11] Simple flag for allowing non-atcg base chars to be present in UMI strings. --- .../com/fulcrumgenomics/umi/GroupReadsByUmi.scala | 6 ++++-- .../fulcrumgenomics/umi/GroupReadsByUmiTest.scala | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index 7c335fd9a..c7ac9960f 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -482,6 +482,7 @@ class GroupReadsByUmi @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI", @arg(flag='m', doc="Minimum mapping quality for mapped reads.") val minMapQ: Int = 1, @arg(flag='n', doc="Include non-PF reads.") val includeNonPfReads: Boolean = false, + @arg(flag='N', doc="Use non-ATCG bases (N) in clustering.") val includeNonATCG: Boolean = false, @arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy, @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1, @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length, @@ -551,7 +552,8 @@ class GroupReadsByUmi .filter(r => (r.mapped || (r.paired && r.mateMapped)) || { filteredPoorAlignment += 1; false }) .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false }) .filter(r => mapqOk(r, this.minMapQ) || { filteredPoorAlignment += 1; false }) - .filter(r => !r.get[String](rawTag).exists(_.contains('N')) || { filteredNsInUmi += 1; false }) + .filter(r => + (this.includeNonATCG || !r.get[String](rawTag).exists(_.contains('N'))) || { filteredNsInUmi += 1; false }) .filter { r => this.minUmiLength.forall { l => r.get[String](this.rawTag).forall { umi => @@ -644,7 +646,7 @@ class GroupReadsByUmi logger.info(f"Accepted $kept%,d reads for grouping.") if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.") logger.info(f"Filtered out $filteredPoorAlignment%,d reads due to mapping issues.") - logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.") + if (!this.includeNonATCG) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.") this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d reads that contained UMIs that were too short.") } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index beb9917ab..2f5c28c61 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -334,6 +334,20 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08")) } + it should "include reads that contain an N in the UMI, if option is set." in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT")) + builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT")) + builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ANN")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".bam") + new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNonATCG = true, strategy = Strategy.Paired, edits = 2).execute() + + readBamRecs(out).map(_.name).distinct shouldBe Seq("a01", "a02", "a03") + } + + it should "exclude reads that contain an N in the UMI" in { val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate)) builder.addPair(name="a01", start1=100, start2=300, strand1=Plus, strand2=Minus, attrs=Map("RX" -> "ACT-ACT")) From 5be3e228c9b4f3b42467fa699e0e1e70495612d7 Mon Sep 17 00:00:00 2001 From: Joe Vieira Date: Tue, 28 Mar 2023 19:12:04 -0400 Subject: [PATCH 02/11] Include UMIs which contain non-ATCG bases (N) in output. documentation for 'N' argument --- src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index c7ac9960f..4b47e80c3 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -482,7 +482,7 @@ class GroupReadsByUmi @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI", @arg(flag='m', doc="Minimum mapping quality for mapped reads.") val minMapQ: Int = 1, @arg(flag='n', doc="Include non-PF reads.") val includeNonPfReads: Boolean = false, - @arg(flag='N', doc="Use non-ATCG bases (N) in clustering.") val includeNonATCG: Boolean = false, + @arg(flag='N', doc="Include UMIs which contain non-ATCG bases (N) in output.") val includeNonATCG: Boolean = false, @arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy, @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1, @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length, From d6b314f5c457f3730bcd19abf789c4c1b9a553bc Mon Sep 17 00:00:00 2001 From: Joe Vieira Date: Fri, 7 Apr 2023 19:12:10 -0400 Subject: [PATCH 03/11] More comprehensive tests for UMIs containing Ns --- .../umi/GroupReadsByUmiTest.scala | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 2f5c28c61..bc759cdfa 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -334,6 +334,56 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08")) } + it should "correctly group together single-end reads with UMIs containing N's ,if option is set" in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA")) + builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAN")) + builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "CACACACA")) + builder.addFrag(name = "a04", start = 100, attrs = Map("RX" -> "CACACACC")) + builder.addFrag(name = "a05", start = 105, attrs = Map("RX" -> "GTAGTAGG")) + builder.addFrag(name = "a06", start = 105, attrs = Map("RX" -> "GTAGTAGG")) + builder.addFrag(name = "a07", start = 107, attrs = Map("RX" -> "AAAAAAAA")) + builder.addFrag(name = "a08", start = 107, attrs = Map("RX" -> "AAAAAAAA")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".sam") + val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") + new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNonATCG = true, strategy = Strategy.Edit, edits = 1).execute() + + val recs = readBamRecs(out) + recs should have size 8 + + val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet) + groups should have size 4 + groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08")) + } + + it should "correctly filter and not group together single-end reads with UMIs containing N's" in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA")) + builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAN")) + builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "CACACACA")) + builder.addFrag(name = "a04", start = 100, attrs = Map("RX" -> "CACACACC")) + builder.addFrag(name = "a05", start = 105, attrs = Map("RX" -> "GTAGTAGG")) + builder.addFrag(name = "a06", start = 105, attrs = Map("RX" -> "GTAGTAGG")) + builder.addFrag(name = "a07", start = 107, attrs = Map("RX" -> "AAAAAAAA")) + builder.addFrag(name = "a08", start = 107, attrs = Map("RX" -> "AAAAAAAA")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".sam") + val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") + new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNonATCG = false, strategy = Strategy.Edit, edits = 1).execute() + + val recs = readBamRecs(out) + recs should have size 7 + + val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet) + groups should have size 4 + groups should contain theSameElementsAs Seq(Set("a01"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08")) + } + + + it should "include reads that contain an N in the UMI, if option is set." in { val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT")) From 746a3c6fc65517c626db40cac4d8e7261675f186 Mon Sep 17 00:00:00 2001 From: Joe Vieira Date: Tue, 11 Apr 2023 15:37:17 -0400 Subject: [PATCH 04/11] Adjust docstring & argument value for inclusion of Ns to better reflect the specific implementation. --- .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 29 ++++++++++--------- .../umi/GroupReadsByUmiTest.scala | 10 +++---- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index 4b47e80c3..51b7c2d09 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -475,21 +475,22 @@ object Strategy extends FgBioEnum[Strategy] { """ ) class GroupReadsByUmi -( @arg(flag='i', doc="The input BAM file.") val input: PathToBam = Io.StdIn, - @arg(flag='o', doc="The output BAM file.") val output: PathToBam = Io.StdOut, - @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None, - @arg(flag='t', doc="The tag containing the raw UMI.") val rawTag: String = "RX", - @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI", - @arg(flag='m', doc="Minimum mapping quality for mapped reads.") val minMapQ: Int = 1, - @arg(flag='n', doc="Include non-PF reads.") val includeNonPfReads: Boolean = false, - @arg(flag='N', doc="Include UMIs which contain non-ATCG bases (N) in output.") val includeNonATCG: Boolean = false, - @arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy, - @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1, - @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length, +(@arg(flag='i', doc="The input BAM file.") val input: PathToBam = Io.StdIn, + @arg(flag='o', doc="The output BAM file.") val output: PathToBam = Io.StdOut, + @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None, + @arg(flag='t', doc="The tag containing the raw UMI.") val rawTag: String = "RX", + @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI", + @arg(flag='m', doc="Minimum mapping quality for mapped reads.") val minMapQ: Int = 1, + @arg(flag='n', doc="Include non-PF reads.") val includeNonPfReads: Boolean = false, + @arg(flag='N', doc="Allow UMIs which contain Ns. Ns will be treated as mismatches to all other non-Ns") + val includeNs: Boolean = false, + @arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy, + @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1, + @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length, |otherwise discard reads with UMIs shorter than this length and allow for differing UMI lengths. |""") val minUmiLength: Option[Int] = None, - @arg(flag='x', doc= """ + @arg(flag='x', doc= """ |DEPRECATED: this option will be removed in future versions and inter-contig reads will be |automatically processed.""") @deprecated val allowInterContig: Boolean = true @@ -553,7 +554,7 @@ class GroupReadsByUmi .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false }) .filter(r => mapqOk(r, this.minMapQ) || { filteredPoorAlignment += 1; false }) .filter(r => - (this.includeNonATCG || !r.get[String](rawTag).exists(_.contains('N'))) || { filteredNsInUmi += 1; false }) + (this.includeNs || !r.get[String](rawTag).exists(_.contains('N'))) || { filteredNsInUmi += 1; false }) .filter { r => this.minUmiLength.forall { l => r.get[String](this.rawTag).forall { umi => @@ -646,7 +647,7 @@ class GroupReadsByUmi logger.info(f"Accepted $kept%,d reads for grouping.") if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.") logger.info(f"Filtered out $filteredPoorAlignment%,d reads due to mapping issues.") - if (!this.includeNonATCG) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.") + if (!this.includeNs) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.") this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d reads that contained UMIs that were too short.") } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index bc759cdfa..f23391d11 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -334,7 +334,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08")) } - it should "correctly group together single-end reads with UMIs containing N's ,if option is set" in { + it should "correctly group together single-end reads with UMIs containing N's, if option is set" in { val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA")) builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAN")) @@ -348,7 +348,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT val in = builder.toTempFile() val out = Files.createTempFile("umi_grouped.", ".sam") val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") - new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNonATCG = true, strategy = Strategy.Edit, edits = 1).execute() + new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Edit, edits = 1).execute() val recs = readBamRecs(out) recs should have size 8 @@ -372,7 +372,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT val in = builder.toTempFile() val out = Files.createTempFile("umi_grouped.", ".sam") val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") - new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNonATCG = false, strategy = Strategy.Edit, edits = 1).execute() + new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = false, strategy = Strategy.Edit, edits = 1).execute() val recs = readBamRecs(out) recs should have size 7 @@ -388,11 +388,11 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT")) builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT")) - builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ANN")) + builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACU")) val in = builder.toTempFile() val out = Files.createTempFile("umi_grouped.", ".bam") - new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNonATCG = true, strategy = Strategy.Paired, edits = 2).execute() + new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Paired, edits = 2).execute() readBamRecs(out).map(_.name).distinct shouldBe Seq("a01", "a02", "a03") } From 685e4d4d2ea296ded603ad316468add91fbe2ccd Mon Sep 17 00:00:00 2001 From: Joe Vieira Date: Tue, 11 Apr 2023 15:47:40 -0400 Subject: [PATCH 05/11] update usage docs to include relevant information about N flag & length counting of N's --- src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index 51b7c2d09..39a7ff7d6 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -466,8 +466,9 @@ object Strategy extends FgBioEnum[Strategy] { | |By default, all UMIs must be the same length. If `--min-umi-length=len` is specified then reads that have a UMI |shorter than `len` will be discarded, and when comparing UMIs of different lengths, the first len bases will be - |compared, where `len` is the length of the shortest UMI. The UMI length is the number of [ACGT] bases in the UMI - |(i.e. does not count dashes and other non-ACGT characters). This option is not implemented for reads with UMI pairs + |compared, where `len` is the length of the shortest UMI. The UMI length is the number of [ACGT] + |([ACTGN], if the N flag is passed) bases in the UMI + |(i.e. does not count dashes and other non-ACGT(N) characters). This option is not implemented for reads with UMI pairs |(i.e. using the paired assigner). | |If the input is not template-coordinate sorted (i.e. `SO:unsorted GO:query SS:unsorted:template-coordinate`), then From cf5f1bc6891c7bbaa14bb4d1c3cb1e7e80330f0f Mon Sep 17 00:00:00 2001 From: Joe Vieira Date: Tue, 11 Apr 2023 19:24:00 -0400 Subject: [PATCH 06/11] formatting fix for docstrings --- .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index 39a7ff7d6..70278cb0d 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -476,22 +476,22 @@ object Strategy extends FgBioEnum[Strategy] { """ ) class GroupReadsByUmi -(@arg(flag='i', doc="The input BAM file.") val input: PathToBam = Io.StdIn, - @arg(flag='o', doc="The output BAM file.") val output: PathToBam = Io.StdOut, - @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None, - @arg(flag='t', doc="The tag containing the raw UMI.") val rawTag: String = "RX", - @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI", - @arg(flag='m', doc="Minimum mapping quality for mapped reads.") val minMapQ: Int = 1, - @arg(flag='n', doc="Include non-PF reads.") val includeNonPfReads: Boolean = false, - @arg(flag='N', doc="Allow UMIs which contain Ns. Ns will be treated as mismatches to all other non-Ns") +( @arg(flag='i', doc="The input BAM file.") val input: PathToBam = Io.StdIn, + @arg(flag='o', doc="The output BAM file.") val output: PathToBam = Io.StdOut, + @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None, + @arg(flag='t', doc="The tag containing the raw UMI.") val rawTag: String = "RX", + @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI", + @arg(flag='m', doc="Minimum mapping quality for mapped reads.") val minMapQ: Int = 1, + @arg(flag='n', doc="Include non-PF reads.") val includeNonPfReads: Boolean = false, + @arg(flag = 'N', doc = "Allow UMIs which contain Ns. Ns will be treated as mismatches to all other non-Ns") val includeNs: Boolean = false, - @arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy, - @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1, - @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length, + @arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy, + @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1, + @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length, |otherwise discard reads with UMIs shorter than this length and allow for differing UMI lengths. |""") val minUmiLength: Option[Int] = None, - @arg(flag='x', doc= """ + @arg(flag='x', doc= """ |DEPRECATED: this option will be removed in future versions and inter-contig reads will be |automatically processed.""") @deprecated val allowInterContig: Boolean = true From e28bd7bf1d757473d8ad5d495cc32a6aa67b7034 Mon Sep 17 00:00:00 2001 From: Joe Vieira Date: Fri, 29 Sep 2023 17:32:11 -0400 Subject: [PATCH 07/11] filter reads which have UMIs only containing N's --- .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 3 ++- .../umi/GroupReadsByUmiTest.scala | 20 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index 70278cb0d..b42f92a56 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -555,7 +555,7 @@ class GroupReadsByUmi .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false }) .filter(r => mapqOk(r, this.minMapQ) || { filteredPoorAlignment += 1; false }) .filter(r => - (this.includeNs || !r.get[String](rawTag).exists(_.contains('N'))) || { filteredNsInUmi += 1; false }) + ((this.includeNs && !r.get[String](rawTag).forall(_.contains('N')) || !r.get[String](rawTag).exists(_.contains('N')))) || { filteredNsInUmi += 1; false }) .filter { r => this.minUmiLength.forall { l => r.get[String](this.rawTag).forall { umi => @@ -649,6 +649,7 @@ class GroupReadsByUmi if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.") logger.info(f"Filtered out $filteredPoorAlignment%,d reads due to mapping issues.") if (!this.includeNs) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.") + if (this.includeNs && filteredNsInUmi > 0) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained only Ns in their UMIs.") this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d reads that contained UMIs that were too short.") } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index f23391d11..c6be9aa5c 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -358,6 +358,26 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08")) } + it should "correctly group filter UMIs which are only N's, if option is set" in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "NNNNNNNN")) + builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAA")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".sam") + val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") + new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Edit, edits = 1).execute() + + val recs = readBamRecs(out) + recs should have size 1 + + val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet) + groups should have size 1 + groups should contain theSameElementsAs Seq(Set("a02")) + } + + + it should "correctly filter and not group together single-end reads with UMIs containing N's" in { val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA")) From efda09e8f0b97874435d29e57563047ac729b82f Mon Sep 17 00:00:00 2001 From: Joe Vieira Date: Fri, 29 Sep 2023 18:12:01 -0400 Subject: [PATCH 08/11] cover "-" cases also --- .../umi/GroupReadsByUmiTest.scala | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index c6be9aa5c..a35180a16 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -358,7 +358,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08")) } - it should "correctly group filter UMIs which are only N's, if option is set" in { + it should "correctly filter UMIs which are only N's, if option is set" in { val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "NNNNNNNN")) builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAA")) @@ -376,6 +376,20 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT groups should contain theSameElementsAs Seq(Set("a02")) } + it should "exclude reads that contain an only N's in the UMI, handle - also." in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "NNN-NNN")) + builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".bam") + new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Paired, edits = 2).execute() + + val recs = readBamRecs(out) + recs should have size 2 + + readBamRecs(out).map(_.name).distinct shouldBe Seq("a02") + } it should "correctly filter and not group together single-end reads with UMIs containing N's" in { @@ -403,7 +417,6 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT } - it should "include reads that contain an N in the UMI, if option is set." in { val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT")) From a57c9026bcfc2af3c1bebafe9ad22297e203e60b Mon Sep 17 00:00:00 2001 From: Joe Vieira Date: Fri, 29 Sep 2023 18:27:48 -0400 Subject: [PATCH 09/11] If UMI is all N's ( and - ) filter it out & log that. --- .../scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index b42f92a56..e93981417 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -546,6 +546,7 @@ class GroupReadsByUmi (this.edits == 0 || this.strategy == Strategy.Identity) } + // Filter and sort the input BAM file logger.info("Filtering the input.") val filteredIterator = in.iterator @@ -555,7 +556,10 @@ class GroupReadsByUmi .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false }) .filter(r => mapqOk(r, this.minMapQ) || { filteredPoorAlignment += 1; false }) .filter(r => - ((this.includeNs && !r.get[String](rawTag).forall(_.contains('N')) || !r.get[String](rawTag).exists(_.contains('N')))) || { filteredNsInUmi += 1; false }) + (this.includeNs && !r.get[String](rawTag).exists{ umi => + umi.forall(c => c == 'N' || c == '-') + } || !r.get[String](rawTag).exists(_.contains('N'))) + || { filteredNsInUmi += 1; false }) .filter { r => this.minUmiLength.forall { l => r.get[String](this.rawTag).forall { umi => From 3a10e7b603b495e334c831ae160614dce907a8ea Mon Sep 17 00:00:00 2001 From: Joe Vieira Date: Fri, 29 Sep 2023 19:00:10 -0400 Subject: [PATCH 10/11] add test to ensure N is treated as mismatch --- .../umi/GroupReadsByUmiTest.scala | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index a35180a16..b77f1b6db 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -421,7 +421,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT")) builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT")) - builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACU")) + builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACN")) val in = builder.toTempFile() val out = Files.createTempFile("umi_grouped.", ".bam") @@ -430,6 +430,27 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT readBamRecs(out).map(_.name).distinct shouldBe Seq("a01", "a02", "a03") } + it should "correctly group UMIs within the edit distance, if option is set" in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAANN")) + builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAA")) + builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "AAAAANNN")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".sam") + val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") + new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Edit, edits = 2).execute() + + val recs = readBamRecs(out) + recs should have size 3 + + val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet) + groups should have size 2 + groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03")) + + } + + it should "exclude reads that contain an N in the UMI" in { val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate)) From e2e204990e978395243586385fb2993c3cc2ea86 Mon Sep 17 00:00:00 2001 From: Joe Vieira Date: Fri, 29 Sep 2023 19:00:55 -0400 Subject: [PATCH 11/11] count N in either position as a mismatch --- .../scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala | 2 +- src/main/scala/com/fulcrumgenomics/util/Sequences.scala | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index e93981417..044cbdb69 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -556,7 +556,7 @@ class GroupReadsByUmi .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false }) .filter(r => mapqOk(r, this.minMapQ) || { filteredPoorAlignment += 1; false }) .filter(r => - (this.includeNs && !r.get[String](rawTag).exists{ umi => + (this.includeNs && !r.get[String](rawTag).exists { umi => umi.forall(c => c == 'N' || c == '-') } || !r.get[String](rawTag).exists(_.contains('N'))) || { filteredNsInUmi += 1; false }) diff --git a/src/main/scala/com/fulcrumgenomics/util/Sequences.scala b/src/main/scala/com/fulcrumgenomics/util/Sequences.scala index 81412a859..3e53378b9 100644 --- a/src/main/scala/com/fulcrumgenomics/util/Sequences.scala +++ b/src/main/scala/com/fulcrumgenomics/util/Sequences.scala @@ -94,7 +94,9 @@ object Sequences { */ def gcContent(s: String): Double = if (s.isEmpty) 0 else SequenceUtil.calculateGc(s.getBytes) - /** Counts the number of mismatches between two sequences of the same length. */ + /** Counts the number of mismatches between two sequences of the same length. + * N's always count as mismatches + */ def countMismatches(s1: String, s2: String): Int = { require(s1.length == s2.length, s"Cannot count mismatches in strings of differing lengths: $s1 $s2") @@ -102,7 +104,7 @@ object Sequences { forloop (from=0, until=s1.length) { i => val a = Character.toUpperCase(s1.charAt(i)) val b = Character.toUpperCase(s2.charAt(i)) - if (a != b) count += 1 + if (a != b || a == 'N' || b == 'N') count += 1 } count