clulab · kwalcock · Jul 18, 2022 · Jun 20, 2022 · Jun 20, 2022 · Jun 21, 2022
diff --git a/main/src/main/resources/org/clulab/numeric/atomic.yml b/main/src/main/resources/org/clulab/numeric/atomic.yml
@@ -14,7 +14,7 @@ rules:
     priority: ${ rulepriority }
     type: token
     pattern: |
-      [entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]*
+      [entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]* (?![tag = /NNP|CD/])
 
   # possible years, from 1ddd to 20dd
   - name: year

diff --git a/main/src/main/resources/org/clulab/numeric/date-ranges.yml b/main/src/main/resources/org/clulab/numeric/date-ranges.yml
@@ -184,3 +184,12 @@
   action: mkDateRangeMentionVagueSeason
   pattern: |
     /^(1\d\d\d|2\d\d\d)(WS|DS)$/ | @year:PossibleYear (?<season> [word = /^(WS|DS)$/])
+
+- name: date-one-token-year-range  # a year range written as a single token, e.g., 2021/2022 or 2000-2009
+  label: Date
+  priority: ${ rulepriority }
+  type: token
+  example: "The areas sown for this 2021/2022 wintering campaign are. [...] rice yield will increase from 3600 in 2000-2009 [...]"
+  action: mkDateRangeMentionOneTokenYearRange
+  pattern: |
+    /^([12]\d\d\d)[\-\/]([12]\d\d\d)$/
diff --git a/main/src/main/resources/org/clulab/numeric/dates.yml b/main/src/main/resources/org/clulab/numeric/dates.yml
@@ -76,7 +76,7 @@ rules:
     example: "26 September in 2011WS"
     action: mkDateMention
     pattern: |
-      ( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ (?<year> [word = /^(\d\d\d\d)(WS|DS)$/])
+      ( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ /^the$/? (?<year> [word = /^(\d\d\d\d)(WS|DS)$/])
 
   # Rule for YYYY-MM-DD (accepts -, :, / as separators)
   - name: date-yyyy-mm-dd

diff --git a/main/src/main/resources/org/clulab/numeric/measurements.yml b/main/src/main/resources/org/clulab/numeric/measurements.yml
@@ -1,4 +1,5 @@
 rules:
+
   - name: measurement-1
     label: Measurement
     priority: ${ rulepriority }
@@ -15,10 +16,18 @@ rules:
     pattern: |
       @number:NumberRange @unit:MeasurementUnit
 
+  - name: measurement-3  # several numbers followed by one unit that they share, e.g., "6.4, 7.9, and 7.1 t/ha"
+    label: Measurement
+    priority: ${ rulepriority }
+    type: token
+    action: mkSharedMeasurementMention
+    pattern: |
+      (@number:Number []{0,2} [word = /,|and|to/]*)+ @number:Number @unit:MeasurementUnit
+
   - name: measurement-percentage
     label: Percentage
     priority: ${ rulepriority }
     type: token
     action: mkPercentage
     pattern: |
-      @number:Number [word=/(?i)pct|percent|%/]
+      @number:Number [word=/(?i)pct|percent|%/]
diff --git a/main/src/main/resources/org/clulab/numeric/number-ranges.yml b/main/src/main/resources/org/clulab/numeric/number-ranges.yml
@@ -9,7 +9,7 @@
   example: "Weeding timing ranged from 2 to 17 days"
   action: mkNumberRangeMention
   pattern: |
-    /(?i)(from|between)/ @number1:Number /(?i)(to|and|\-)/ @number2:Number
+    /(?i)(from|between)/ @number1:Number [entity = /MEASUREMENT-UNIT/]* /(?i)(to|and|\-)/ @number2:Number 
 
 - name: number-range-2
   priority: ${rulepriority}

diff --git a/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala b/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala
@@ -16,7 +16,24 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions {
     val convertedMentions = new ArrayBuffer[Mention]()
     for(m <- mentions) {
       try {
-        convertedMentions += converter(m)
+        convertedMentions += converter(m )
+      } catch {
+        case e: Exception =>
+          // sometimes these conversions fail, mainly on broken texts
+          // let's be robust here: report the error and move on
+          System.err.println(s"WARNING: $converterName conversion failed! Recovering and continuing...")
+          e.printStackTrace()
+      }
+    }
+    convertedMentions
+  }
+
+  /** Converts a sequence of mentions to new types given the converter function */
+  private def convertWithOneToManyConverter(mentions: Seq[Mention], converter: Mention => Seq[Mention], converterName: String): Seq[Mention] = {
+    val convertedMentions = new ArrayBuffer[Mention]()
+    for(m <- mentions) {
+      try {
+        convertedMentions ++= converter(m )
       } catch {
         case e: Exception =>
           // sometimes these conversions fail, mainly on broken texts
@@ -38,6 +55,11 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions {
     convert(mentions, toMeasurementMention, "toMeasurementMention")
   }
 
+  /** Constructs MeasurementMentions from a token pattern in which several numbers share one unit; one match may yield several mentions */
+  def mkSharedMeasurementMention(mentions: Seq[Mention], state: State): Seq[Mention] = {
+    convertWithOneToManyConverter(mentions, toSharedMeasurementMention, "toSharedMeasurementMention")
+  }
+
   def mkPercentage(mentions: Seq[Mention], state: State): Seq[Mention] = {
     convert(mentions, toPercentageMention, "toPercentageMention")
   }
@@ -120,6 +142,11 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions {
     convert(mentions, toDateRangeMentionFromVagueSeason, "mkDateRangeMentionVagueSeason")
   }
 
+  /** Constructs a DateRangeMention from a single token that holds a year range, e.g., 2021/2022 or 2000-2009 */
+  def mkDateRangeMentionOneTokenYearRange(mentions: Seq[Mention], state: State): Seq[Mention] = {
+    convert(mentions, toDateRangeMentionFromOneTokenYearRange, "mkDateRangeMentionOneTokenYearRange")
+  }
+
   /** Constructs a DateMention from a token pattern */
   def mkDateMention(mentions: Seq[Mention], state: State): Seq[Mention] = {
     convert(mentions, toDateMention, "toDateMention")

diff --git a/main/src/main/scala/org/clulab/numeric/mentions/package.scala b/main/src/main/scala/org/clulab/numeric/mentions/package.scala
@@ -1,6 +1,8 @@
 package org.clulab.numeric
 
-import org.clulab.odin.{Mention, RelationMention, TextBoundMention}
+import org.clulab.odin.{EventMention, Mention, RelationMention, TextBoundMention}
+import org.clulab.struct.Interval
+
 import java.util.regex.Pattern
 
 package object mentions {
@@ -68,6 +70,36 @@ package object mentions {
       throw new RuntimeException(s"ERROR: cannot convert mention of type [${m.getClass.toString}] to MeasurementMention!")
   }
 
+  /** Builds one MeasurementMention per "number" argument of the mention; all of them share its single "unit" argument */
+  def toSharedMeasurementMention(mention: Mention): Seq[Mention] =  mention match {
+    case m:  MeasurementMention => Seq(m)
+
+    case m: RelationMention =>
+      m.arguments("number").sortBy(_.tokenInterval).map { a =>
+        val newArgs = Seq(m.arguments("unit").head, a).sortBy(_.tokenInterval)
+        // token intervals are end-exclusive, so the two args are adjacent when the later one starts where the earlier one ends
+        val adjacent = newArgs.last.tokenInterval.start == newArgs.head.tokenInterval.end
+        // if num and unit are adjacent, include both in new token int, else use the token int of the number arg
+        val newTokInt = if (adjacent) Interval(newArgs.head.tokenInterval.start, newArgs.last.tokenInterval.end) else a.tokenInterval
+        new MeasurementMention(
+          m.labels,
+          newTokInt,
+          m.sentence,
+          m.document,
+          m.keep,
+          m.foundBy,
+          m.attachments,
+          Some(a.words),
+          getArgWords("unit", m),
+          false
+        )
+      }
+
+    case m =>
+      throw new RuntimeException(s"ERROR: cannot convert mention of type [${m.getClass.toString}] to MeasurementMention!")
+  }
+
+
   def toPercentageMention(mention: Mention): PercentageMention =  mention match {
     case m:  PercentageMention => m
 
@@ -470,6 +502,22 @@ package object mentions {
       throw new RuntimeException(s"ERROR: cannot convert mention of type ${m.getClass.toString} to DateRangeMention!")
   }
 
+  /** Converts a single token that holds a year range, e.g., 2020/2021 or 2020-2021, to a DateRangeMention */
+  def toDateRangeMentionFromOneTokenYearRange(mention: Mention): DateRangeMention =  mention match {
+    case alreadyConverted: DateRangeMention => alreadyConverted
+
+    case textBound: TextBoundMention =>
+      // the first and last pieces around the separator are taken as the start and end years
+      val pieces = textBound.text.split("[-\\/]")
+      // only the year is supplied to the formatter; its first two arguments stay None
+      val startDate = TempEvalFormatter.mkDate(None, None, Some(Seq(pieces.head)))
+      val endDate = TempEvalFormatter.mkDate(None, None, Some(Seq(pieces.last)))
+      DateRangeMention(textBound, startDate, endDate)
+
+    case other =>
+      throw new RuntimeException(s"ERROR: cannot convert mention of type ${other.getClass.toString} to DateRangeMention!")
+  }
+
   def toDateMention(mention: Mention): DateMention =  mention match {
     case m: DateMention => m
 

diff --git a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala
@@ -153,6 +153,11 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
     ensure(sentence= "19:02.", Interval(0, 1), goldEntity= "DATE", goldNorm= "XX19-02-XX")
   }
 
+  it should "recognize one token year ranges" in { // the whole range is one token, hence Interval(0, 1)
+    ensure(sentence= "2021/2022", Interval(0, 1), goldEntity= "DATE-RANGE", goldNorm= "2021-XX-XX -- 2022-XX-XX") // slash-separated years
+    ensure(sentence= "2000-2009", Interval(0, 1), goldEntity= "DATE-RANGE", goldNorm= "2000-XX-XX -- 2009-XX-XX") // hyphen-separated years
+  }
+
   it should "recognize numeric dates of form month of year" in {
     ensure(sentence= "sowing date is best in May of 2020", Interval(5, 8), goldEntity= "DATE", goldNorm= "2020-05-XX")
     ensure(sentence= "sowing date in July of 2020", Interval(3, 6), goldEntity= "DATE", goldNorm= "2020-07-XX")
@@ -322,6 +327,12 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
   it should "recognize date ranges with vague seasons" in {
     ensure("Seeding dates ranged from 22 August to 26 September in 2011WS.",
       Interval(3, 11), "DATE-RANGE", "2011-08-22 -- 2011-09-26")
+    // same construction, but with "in the <year>WS"; the sentence holds two ranges, so each clause is checked on its own
+    val twoSeasons = "The planned timing for the first split was 23 days after sowing (from 3 to 13 August in the 1999WS and from 14 to 25 August in the 2000WS"
+    ensure(twoSeasons,
+      Interval(13, 21), "DATE-RANGE", "1999-08-03 -- 1999-08-13")
+    ensure(twoSeasons,
+      Interval(22, 29), "DATE-RANGE", "2000-08-14 -- 2000-08-25")
   }
 
   it should "recognize date ranges (month/day) with vague seasons" in {
@@ -438,6 +449,22 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
 
     // TODO: not sure what should be the output of such measurement '3 or 4 days'
     ensure(sentence= "and lasted 3 or 4 days in both wet seasons", Interval(4, 6), goldEntity="MEASUREMENT", goldNorm="4.0 d")
+    ensure(sentence= "ranged from 2.7 t ha-1 to 7.1 t ha-1", Interval(1, 9), goldEntity="MEASUREMENT", goldNorm="2.7 -- 7.1 t/ha")
+    ensure(sentence= "yields were between 8.8 t ha-1 and 9.2 t ha-1", Interval(2, 10), goldEntity="MEASUREMENT", goldNorm="8.8 -- 9.2 t/ha")
+  }
+
+  it should "recognize shared units" in {
+    // every number of an enumeration is expected to be normalized with the unit that follows the last number
+    val targetYields = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively."
+    for ((span, norm) <- Seq(Interval(7,8) -> "6.4 t/ha", Interval(9,10) -> "7.9 t/ha", Interval(12,13) -> "7.1 t/ha")) ensure(sentence = targetYields, span, goldEntity="MEASUREMENT", goldNorm=norm)
+    ensure(sentence = "was estimated at 9 and 10 t / ha", Interval(3, 4), goldEntity="MEASUREMENT", goldNorm="9.0 t/ha")
+    ensure(sentence = "was estimated at 9 and 10 t / ha", Interval(5, 9), goldEntity="MEASUREMENT", goldNorm="10.0 t/ha")
+    ensure(sentence = "yield will increase from 3600 in 2000-2009 to 4500 kg ha-1 in 2090-2099", Interval(4, 5), goldEntity="MEASUREMENT", goldNorm="3600.0 kg/ha")
+  }
+
+  it should "not recognize preposition `in` as `inch`" in { // a number followed by the preposition "in" must not become a measurement
+    ensure(sentence = "released as Sahel 108 in Senegal in 1994", Interval(3,5), goldEntity="O", goldNorm="") // span is meant to cover "108 in"
+    ensure(sentence = "92% grew Sahel 108 in 2012DS", Interval(3,5), goldEntity="O", goldNorm="") // NOTE(review): if "92%" is split into two tokens, this span covers "Sahel 108" rather than "108 in" - confirm
   }
 
   // TODO: this requires non trivial changes to the tokenizer
@@ -565,13 +592,18 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
     if(goldEntity.nonEmpty) {
       var first = true
       for (i <- span.indices) {
-        val prefix = if (first) "B-" else "I-"
-        val label = prefix + goldEntity
+        if (goldEntity == "O") {
+          norms(i) should be(goldNorm)
+        } else {
+          val prefix = if (first) "B-" else "I-"
+          val label = prefix + goldEntity
+
+          entities(i) should be(label)
+          norms(i) should be(goldNorm)
 
-        entities(i) should be(label)
-        norms(i) should be(goldNorm)
+          first = false
+        }
 
-        first = false
       }
     }
   }