From 10f5fe4f64d90c216e1a8197fc6216c685abf4cd Mon Sep 17 00:00:00 2001
From: Joe Vieira <joe.vieira@gmail.com>
Date: Tue, 28 Mar 2023 19:01:07 -0400
Subject: [PATCH 01/11] Simple flag for allowing non-atcg base chars to be
 present in UMI strings.

---
 .../com/fulcrumgenomics/umi/GroupReadsByUmi.scala  |  6 ++++--
 .../fulcrumgenomics/umi/GroupReadsByUmiTest.scala  | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
index 7c335fd9a..c7ac9960f 100644
--- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
+++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -482,6 +482,7 @@ class GroupReadsByUmi
   @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI",
   @arg(flag='m', doc="Minimum mapping quality for mapped reads.")         val minMapQ: Int      = 1,
   @arg(flag='n', doc="Include non-PF reads.")            val includeNonPfReads: Boolean = false,
+  @arg(flag='N', doc="Use non-ATCG bases (N) in clustering.")             val includeNonATCG: Boolean = false,
   @arg(flag='s', doc="The UMI assignment strategy.")     val strategy: Strategy,
   @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1,
   @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length,
@@ -551,7 +552,8 @@ class GroupReadsByUmi
       .filter(r => (r.mapped || (r.paired && r.mateMapped))                         || { filteredPoorAlignment += 1; false })
       .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false })
       .filter(r => mapqOk(r, this.minMapQ)                                          || { filteredPoorAlignment += 1; false })
-      .filter(r => !r.get[String](rawTag).exists(_.contains('N'))                   || { filteredNsInUmi += 1; false })
+      .filter(r =>
+        (this.includeNonATCG || !r.get[String](rawTag).exists(_.contains('N')))     || { filteredNsInUmi += 1; false })
       .filter { r =>
         this.minUmiLength.forall { l =>
           r.get[String](this.rawTag).forall { umi =>
@@ -644,7 +646,7 @@ class GroupReadsByUmi
     logger.info(f"Accepted $kept%,d reads for grouping.")
     if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.")
     logger.info(f"Filtered out $filteredPoorAlignment%,d reads due to mapping issues.")
-    logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.")
+    if (!this.includeNonATCG) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.")
     this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d reads that contained UMIs that were too short.") }
   }
 
diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
index beb9917ab..2f5c28c61 100644
--- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
+++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
@@ -334,6 +334,20 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
   }
 
+  it should "include reads that contain an N in the UMI, if option is set." in {
+    val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
+    builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))
+    builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))
+    builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ANN"))
+
+    val in = builder.toTempFile()
+    val out = Files.createTempFile("umi_grouped.", ".bam")
+    new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNonATCG = true, strategy = Strategy.Paired, edits = 2).execute()
+
+    readBamRecs(out).map(_.name).distinct shouldBe Seq("a01", "a02", "a03")
+  }
+
+
   it should "exclude reads that contain an N in the UMI" in {
     val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))
     builder.addPair(name="a01", start1=100, start2=300, strand1=Plus,  strand2=Minus, attrs=Map("RX" -> "ACT-ACT"))

From 5be3e228c9b4f3b42467fa699e0e1e70495612d7 Mon Sep 17 00:00:00 2001
From: Joe Vieira <joe.vieira@gmail.com>
Date: Tue, 28 Mar 2023 19:12:04 -0400
Subject: [PATCH 02/11] Include UMIs which contain non-ATCG bases (N) in
 output. documentation for 'N' argument

---
 src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
index c7ac9960f..4b47e80c3 100644
--- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
+++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -482,7 +482,7 @@ class GroupReadsByUmi
   @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI",
   @arg(flag='m', doc="Minimum mapping quality for mapped reads.")         val minMapQ: Int      = 1,
   @arg(flag='n', doc="Include non-PF reads.")            val includeNonPfReads: Boolean = false,
-  @arg(flag='N', doc="Use non-ATCG bases (N) in clustering.")             val includeNonATCG: Boolean = false,
+  @arg(flag='N', doc="Include UMIs which contain non-ATCG bases (N) in output.")             val includeNonATCG: Boolean = false,
   @arg(flag='s', doc="The UMI assignment strategy.")     val strategy: Strategy,
   @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1,
   @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length,

From d6b314f5c457f3730bcd19abf789c4c1b9a553bc Mon Sep 17 00:00:00 2001
From: Joe Vieira <joe.vieira@gmail.com>
Date: Fri, 7 Apr 2023 19:12:10 -0400
Subject: [PATCH 03/11] More comprehensive tests for UMIs containing Ns

---
 .../umi/GroupReadsByUmiTest.scala             | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
index 2f5c28c61..bc759cdfa 100644
--- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
+++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
@@ -334,6 +334,56 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
   }
 
+  it should "correctly group together single-end reads with UMIs containing N's ,if option is set" in {
+    val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
+    builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA"))
+    builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAN"))
+    builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "CACACACA"))
+    builder.addFrag(name = "a04", start = 100, attrs = Map("RX" -> "CACACACC"))
+    builder.addFrag(name = "a05", start = 105, attrs = Map("RX" -> "GTAGTAGG"))
+    builder.addFrag(name = "a06", start = 105, attrs = Map("RX" -> "GTAGTAGG"))
+    builder.addFrag(name = "a07", start = 107, attrs = Map("RX" -> "AAAAAAAA"))
+    builder.addFrag(name = "a08", start = 107, attrs = Map("RX" -> "AAAAAAAA"))
+
+    val in = builder.toTempFile()
+    val out = Files.createTempFile("umi_grouped.", ".sam")
+    val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
+    new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNonATCG = true, strategy = Strategy.Edit, edits = 1).execute()
+
+    val recs = readBamRecs(out)
+    recs should have size 8
+
+    val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet)
+    groups should have size 4
+    groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
+  }
+
+  it should "correctly filter and not group together single-end reads with UMIs containing N's" in {
+    val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
+    builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA"))
+    builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAN"))
+    builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "CACACACA"))
+    builder.addFrag(name = "a04", start = 100, attrs = Map("RX" -> "CACACACC"))
+    builder.addFrag(name = "a05", start = 105, attrs = Map("RX" -> "GTAGTAGG"))
+    builder.addFrag(name = "a06", start = 105, attrs = Map("RX" -> "GTAGTAGG"))
+    builder.addFrag(name = "a07", start = 107, attrs = Map("RX" -> "AAAAAAAA"))
+    builder.addFrag(name = "a08", start = 107, attrs = Map("RX" -> "AAAAAAAA"))
+
+    val in = builder.toTempFile()
+    val out = Files.createTempFile("umi_grouped.", ".sam")
+    val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
+    new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNonATCG = false, strategy = Strategy.Edit, edits = 1).execute()
+
+    val recs = readBamRecs(out)
+    recs should have size 7
+
+    val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet)
+    groups should have size 4
+    groups should contain theSameElementsAs Seq(Set("a01"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
+  }
+
+
+
   it should "include reads that contain an N in the UMI, if option is set." in {
     val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
     builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))

From 746a3c6fc65517c626db40cac4d8e7261675f186 Mon Sep 17 00:00:00 2001
From: Joe Vieira <joe.vieira@gmail.com>
Date: Tue, 11 Apr 2023 15:37:17 -0400
Subject: [PATCH 04/11] Adjust docstring & argument value for inclusion of Ns
 to better reflect the specific implementation.

---
 .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 29 ++++++++++---------
 .../umi/GroupReadsByUmiTest.scala             | 10 +++----
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
index 4b47e80c3..51b7c2d09 100644
--- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
+++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -475,21 +475,22 @@ object Strategy extends FgBioEnum[Strategy] {
   """
 )
 class GroupReadsByUmi
-( @arg(flag='i', doc="The input BAM file.")              val input: PathToBam  = Io.StdIn,
-  @arg(flag='o', doc="The output BAM file.")             val output: PathToBam = Io.StdOut,
-  @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None,
-  @arg(flag='t', doc="The tag containing the raw UMI.")  val rawTag: String    = "RX",
-  @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI",
-  @arg(flag='m', doc="Minimum mapping quality for mapped reads.")         val minMapQ: Int      = 1,
-  @arg(flag='n', doc="Include non-PF reads.")            val includeNonPfReads: Boolean = false,
-  @arg(flag='N', doc="Include UMIs which contain non-ATCG bases (N) in output.")             val includeNonATCG: Boolean = false,
-  @arg(flag='s', doc="The UMI assignment strategy.")     val strategy: Strategy,
-  @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1,
-  @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length,
+(@arg(flag='i', doc="The input BAM file.")              val input: PathToBam  = Io.StdIn,
+ @arg(flag='o', doc="The output BAM file.")             val output: PathToBam = Io.StdOut,
+ @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None,
+ @arg(flag='t', doc="The tag containing the raw UMI.")  val rawTag: String    = "RX",
+ @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI",
+ @arg(flag='m', doc="Minimum mapping quality for mapped reads.")         val minMapQ: Int      = 1,
+ @arg(flag='n', doc="Include non-PF reads.")            val includeNonPfReads: Boolean = false,
+ @arg(flag='N', doc="Allow UMIs which contain Ns.  Ns will be treated as mismatches to all other non-Ns")
+  val includeNs: Boolean = false,
+ @arg(flag='s', doc="The UMI assignment strategy.")     val strategy: Strategy,
+ @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1,
+ @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length,
                        |otherwise discard reads with UMIs shorter than this length and allow for differing UMI lengths.
                        |""")
   val minUmiLength: Option[Int] = None,
-  @arg(flag='x', doc= """
+ @arg(flag='x', doc= """
                        |DEPRECATED: this option will be removed in future versions and inter-contig reads will be
                        |automatically processed.""")
   @deprecated val allowInterContig: Boolean = true
@@ -553,7 +554,7 @@ class GroupReadsByUmi
       .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false })
       .filter(r => mapqOk(r, this.minMapQ)                                          || { filteredPoorAlignment += 1; false })
       .filter(r =>
-        (this.includeNonATCG || !r.get[String](rawTag).exists(_.contains('N')))     || { filteredNsInUmi += 1; false })
+        (this.includeNs || !r.get[String](rawTag).exists(_.contains('N')))     || { filteredNsInUmi += 1; false })
       .filter { r =>
         this.minUmiLength.forall { l =>
           r.get[String](this.rawTag).forall { umi =>
@@ -646,7 +647,7 @@ class GroupReadsByUmi
     logger.info(f"Accepted $kept%,d reads for grouping.")
     if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.")
     logger.info(f"Filtered out $filteredPoorAlignment%,d reads due to mapping issues.")
-    if (!this.includeNonATCG) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.")
+    if (!this.includeNs) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.")
     this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d reads that contained UMIs that were too short.") }
   }
 
diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
index bc759cdfa..f23391d11 100644
--- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
+++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
@@ -334,7 +334,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
   }
 
-  it should "correctly group together single-end reads with UMIs containing N's ,if option is set" in {
+  it should "correctly group together single-end reads with UMIs containing N's, if option is set" in {
     val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
     builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA"))
     builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAN"))
@@ -348,7 +348,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     val in = builder.toTempFile()
     val out = Files.createTempFile("umi_grouped.", ".sam")
     val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
-    new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNonATCG = true, strategy = Strategy.Edit, edits = 1).execute()
+    new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Edit, edits = 1).execute()
 
     val recs = readBamRecs(out)
     recs should have size 8
@@ -372,7 +372,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     val in = builder.toTempFile()
     val out = Files.createTempFile("umi_grouped.", ".sam")
     val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
-    new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNonATCG = false, strategy = Strategy.Edit, edits = 1).execute()
+    new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = false, strategy = Strategy.Edit, edits = 1).execute()
 
     val recs = readBamRecs(out)
     recs should have size 7
@@ -388,11 +388,11 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
     builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))
     builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))
-    builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ANN"))
+    builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACU"))
 
     val in = builder.toTempFile()
     val out = Files.createTempFile("umi_grouped.", ".bam")
-    new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNonATCG = true, strategy = Strategy.Paired, edits = 2).execute()
+    new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Paired, edits = 2).execute()
 
     readBamRecs(out).map(_.name).distinct shouldBe Seq("a01", "a02", "a03")
   }

From 685e4d4d2ea296ded603ad316468add91fbe2ccd Mon Sep 17 00:00:00 2001
From: Joe Vieira <joe.vieira@gmail.com>
Date: Tue, 11 Apr 2023 15:47:40 -0400
Subject: [PATCH 05/11] update usage docs to include relevant information about
 N flag & length counting of N's

---
 src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
index 51b7c2d09..39a7ff7d6 100644
--- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
+++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -466,8 +466,9 @@ object Strategy extends FgBioEnum[Strategy] {
     |
     |By default, all UMIs must be the same length. If `--min-umi-length=len` is specified then reads that have a UMI
     |shorter than `len` will be discarded, and when comparing UMIs of different lengths, the first len bases will be
-    |compared, where `len` is the length of the shortest UMI. The UMI length is the number of [ACGT] bases in the UMI
-    |(i.e. does not count dashes and other non-ACGT characters). This option is not implemented for reads with UMI pairs
+    |compared, where `len` is the length of the shortest UMI. The UMI length is the number of [ACGT]
+    |([ACTGN], if the N flag is passed) bases in the UMI
+    |(i.e. does not count dashes and other non-ACGT(N) characters). This option is not implemented for reads with UMI pairs
     |(i.e. using the paired assigner).
     |
     |If the input is not template-coordinate sorted (i.e. `SO:unsorted GO:query SS:unsorted:template-coordinate`), then

From cf5f1bc6891c7bbaa14bb4d1c3cb1e7e80330f0f Mon Sep 17 00:00:00 2001
From: Joe Vieira <joe.vieira@gmail.com>
Date: Tue, 11 Apr 2023 19:24:00 -0400
Subject: [PATCH 06/11] formatting fix for docstrings

---
 .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
index 39a7ff7d6..70278cb0d 100644
--- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
+++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -476,22 +476,22 @@ object Strategy extends FgBioEnum[Strategy] {
   """
 )
 class GroupReadsByUmi
-(@arg(flag='i', doc="The input BAM file.")              val input: PathToBam  = Io.StdIn,
- @arg(flag='o', doc="The output BAM file.")             val output: PathToBam = Io.StdOut,
- @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None,
- @arg(flag='t', doc="The tag containing the raw UMI.")  val rawTag: String    = "RX",
- @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI",
- @arg(flag='m', doc="Minimum mapping quality for mapped reads.")         val minMapQ: Int      = 1,
- @arg(flag='n', doc="Include non-PF reads.")            val includeNonPfReads: Boolean = false,
- @arg(flag='N', doc="Allow UMIs which contain Ns.  Ns will be treated as mismatches to all other non-Ns")
+( @arg(flag='i', doc="The input BAM file.")              val input: PathToBam  = Io.StdIn,
+  @arg(flag='o', doc="The output BAM file.")             val output: PathToBam = Io.StdOut,
+  @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None,
+  @arg(flag='t', doc="The tag containing the raw UMI.")  val rawTag: String    = "RX",
+  @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI",
+  @arg(flag='m', doc="Minimum mapping quality for mapped reads.")         val minMapQ: Int      = 1,
+  @arg(flag='n', doc="Include non-PF reads.")            val includeNonPfReads: Boolean = false,
+  @arg(flag = 'N', doc = "Allow UMIs which contain Ns.  Ns will be treated as mismatches to all other non-Ns")
   val includeNs: Boolean = false,
- @arg(flag='s', doc="The UMI assignment strategy.")     val strategy: Strategy,
- @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1,
- @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length,
+  @arg(flag='s', doc="The UMI assignment strategy.")     val strategy: Strategy,
+  @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1,
+  @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length,
                        |otherwise discard reads with UMIs shorter than this length and allow for differing UMI lengths.
                        |""")
   val minUmiLength: Option[Int] = None,
- @arg(flag='x', doc= """
+  @arg(flag='x', doc= """
                        |DEPRECATED: this option will be removed in future versions and inter-contig reads will be
                        |automatically processed.""")
   @deprecated val allowInterContig: Boolean = true

From e28bd7bf1d757473d8ad5d495cc32a6aa67b7034 Mon Sep 17 00:00:00 2001
From: Joe Vieira <joe.vieira@gmail.com>
Date: Fri, 29 Sep 2023 17:32:11 -0400
Subject: [PATCH 07/11] filter reads which have UMIs only containing N's

---
 .../fulcrumgenomics/umi/GroupReadsByUmi.scala |  3 ++-
 .../umi/GroupReadsByUmiTest.scala             | 20 +++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
index 70278cb0d..b42f92a56 100644
--- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
+++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -555,7 +555,7 @@ class GroupReadsByUmi
       .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false })
       .filter(r => mapqOk(r, this.minMapQ)                                          || { filteredPoorAlignment += 1; false })
       .filter(r =>
-        (this.includeNs || !r.get[String](rawTag).exists(_.contains('N')))     || { filteredNsInUmi += 1; false })
+        ((this.includeNs && !r.get[String](rawTag).forall(_.contains('N'))  || !r.get[String](rawTag).exists(_.contains('N'))))     || { filteredNsInUmi += 1; false })
       .filter { r =>
         this.minUmiLength.forall { l =>
           r.get[String](this.rawTag).forall { umi =>
@@ -649,6 +649,7 @@ class GroupReadsByUmi
     if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.")
     logger.info(f"Filtered out $filteredPoorAlignment%,d reads due to mapping issues.")
     if (!this.includeNs) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.")
+    if (this.includeNs && filteredNsInUmi > 0) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained only Ns in their UMIs.")
     this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d reads that contained UMIs that were too short.") }
   }
 
diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
index f23391d11..c6be9aa5c 100644
--- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
+++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
@@ -358,6 +358,26 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
   }
 
+  it should "correctly group filter UMIs which are only N's, if option is set" in {
+    val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
+    builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "NNNNNNNN"))
+    builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAA"))
+
+    val in = builder.toTempFile()
+    val out = Files.createTempFile("umi_grouped.", ".sam")
+    val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
+    new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Edit, edits = 1).execute()
+
+    val recs = readBamRecs(out)
+    recs should have size 1
+
+    val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet)
+    groups should have size 1
+    groups should contain theSameElementsAs Seq(Set("a02"))
+  }
+
+
+
   it should "correctly filter and not group together single-end reads with UMIs containing N's" in {
     val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
     builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA"))

From efda09e8f0b97874435d29e57563047ac729b82f Mon Sep 17 00:00:00 2001
From: Joe Vieira <joe.vieira@gmail.com>
Date: Fri, 29 Sep 2023 18:12:01 -0400
Subject: [PATCH 08/11] cover "-" cases also

---
 .../umi/GroupReadsByUmiTest.scala               | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
index c6be9aa5c..a35180a16 100644
--- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
+++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
@@ -358,7 +358,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
   }
 
-  it should "correctly group filter UMIs which are only N's, if option is set" in {
+  it should "correctly filter UMIs which are only N's, if option is set" in {
     val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
     builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "NNNNNNNN"))
     builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAA"))
@@ -376,6 +376,20 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     groups should contain theSameElementsAs Seq(Set("a02"))
   }
 
+  it should "exclude reads that contain an only N's in the UMI, handle - also." in {
+    val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
+    builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "NNN-NNN"))
+    builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))
+
+    val in = builder.toTempFile()
+    val out = Files.createTempFile("umi_grouped.", ".bam")
+    new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Paired, edits = 2).execute()
+
+    val recs = readBamRecs(out)
+    recs should have size 2
+
+    readBamRecs(out).map(_.name).distinct shouldBe Seq("a02")
+  }
 
 
   it should "correctly filter and not group together single-end reads with UMIs containing N's" in {
@@ -403,7 +417,6 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
   }
 
 
-
   it should "include reads that contain an N in the UMI, if option is set." in {
     val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
     builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))

From a57c9026bcfc2af3c1bebafe9ad22297e203e60b Mon Sep 17 00:00:00 2001
From: Joe Vieira <joe.vieira@gmail.com>
Date: Fri, 29 Sep 2023 18:27:48 -0400
Subject: [PATCH 09/11] If UMI is all N's ( and - ) filter it out & log that.

---
 .../scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala     | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
index b42f92a56..e93981417 100644
--- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
+++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -546,6 +546,7 @@ class GroupReadsByUmi
         (this.edits == 0 || this.strategy == Strategy.Identity)
     }
 
+
     // Filter and sort the input BAM file
     logger.info("Filtering the input.")
     val filteredIterator = in.iterator
@@ -555,7 +556,10 @@ class GroupReadsByUmi
       .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false })
       .filter(r => mapqOk(r, this.minMapQ)                                          || { filteredPoorAlignment += 1; false })
       .filter(r =>
-        ((this.includeNs && !r.get[String](rawTag).forall(_.contains('N'))  || !r.get[String](rawTag).exists(_.contains('N'))))     || { filteredNsInUmi += 1; false })
+        (this.includeNs && !r.get[String](rawTag).exists{ umi =>
+          umi.forall(c => c == 'N' || c == '-')
+        } || !r.get[String](rawTag).exists(_.contains('N')))
+          || { filteredNsInUmi += 1; false })
       .filter { r =>
         this.minUmiLength.forall { l =>
           r.get[String](this.rawTag).forall { umi =>

From 3a10e7b603b495e334c831ae160614dce907a8ea Mon Sep 17 00:00:00 2001
From: Joe Vieira <joe.vieira@gmail.com>
Date: Fri, 29 Sep 2023 19:00:10 -0400
Subject: [PATCH 10/11] add test to ensure N is treated as mismatch

---
 .../umi/GroupReadsByUmiTest.scala             | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
index a35180a16..b77f1b6db 100644
--- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
+++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
@@ -421,7 +421,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
     builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))
     builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))
-    builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACU"))
+    builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACN"))
 
     val in = builder.toTempFile()
     val out = Files.createTempFile("umi_grouped.", ".bam")
@@ -430,6 +430,27 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     readBamRecs(out).map(_.name).distinct shouldBe Seq("a01", "a02", "a03")
   }
 
+  it should "correctly group UMIs within the edit distance, if option is set" in {
+    val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
+    builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAANN"))
+    builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAA"))
+    builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "AAAAANNN"))
+
+    val in = builder.toTempFile()
+    val out = Files.createTempFile("umi_grouped.", ".sam")
+    val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
+    new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Edit, edits = 2).execute()
+
+    val recs = readBamRecs(out)
+    recs should have size 3
+
+    val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet)
+    groups should have size 2
+    groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03"))
+
+  }
+
+
 
   it should "exclude reads that contain an N in the UMI" in {
     val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))

From e2e204990e978395243586385fb2993c3cc2ea86 Mon Sep 17 00:00:00 2001
From: Joe Vieira <joe.vieira@gmail.com>
Date: Fri, 29 Sep 2023 19:00:55 -0400
Subject: [PATCH 11/11] count N in either position as a mismatch

---
 .../scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala     | 2 +-
 src/main/scala/com/fulcrumgenomics/util/Sequences.scala     | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
index e93981417..044cbdb69 100644
--- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
+++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -556,7 +556,7 @@ class GroupReadsByUmi
       .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false })
       .filter(r => mapqOk(r, this.minMapQ)                                          || { filteredPoorAlignment += 1; false })
       .filter(r =>
-        (this.includeNs && !r.get[String](rawTag).exists{ umi =>
+        (this.includeNs && !r.get[String](rawTag).exists { umi =>
           umi.forall(c => c == 'N' || c == '-')
         } || !r.get[String](rawTag).exists(_.contains('N')))
           || { filteredNsInUmi += 1; false })
diff --git a/src/main/scala/com/fulcrumgenomics/util/Sequences.scala b/src/main/scala/com/fulcrumgenomics/util/Sequences.scala
index 81412a859..3e53378b9 100644
--- a/src/main/scala/com/fulcrumgenomics/util/Sequences.scala
+++ b/src/main/scala/com/fulcrumgenomics/util/Sequences.scala
@@ -94,7 +94,9 @@ object Sequences {
     */
   def gcContent(s: String): Double = if (s.isEmpty) 0 else SequenceUtil.calculateGc(s.getBytes)
 
-  /** Counts the number of mismatches between two sequences of the same length. */
+  /** Counts the number of mismatches between two sequences of the same length.
+    * N's always count as mismatches
+    */
   def countMismatches(s1: String, s2: String): Int = {
     require(s1.length == s2.length, s"Cannot count mismatches in strings of differing lengths: $s1 $s2")
 
@@ -102,7 +104,7 @@ object Sequences {
     forloop (from=0, until=s1.length) { i =>
       val a = Character.toUpperCase(s1.charAt(i))
       val b = Character.toUpperCase(s2.charAt(i))
-      if (a != b) count += 1
+      if (a != b || a  == 'N' || b == 'N') count += 1
     }
 
     count