Merge pull request #649 from clulab/masha-norms

more norms
clulab · Jul 18, 2022 · 23799f6 · 23799f6
2 parents 8850190 + 9b7335c
commit 23799f6
Show file tree

Hide file tree

Showing 13 changed files with 171 additions and 19 deletions.
diff --git a/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv b/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv
@@ -174,6 +174,7 @@ kgP2O5/ha                  // kg/ha
 Mg ha-1                    // Mg/ha
 mg / l                     // mg/l
 mg.l-1                     // mg/l
+mg l-1                     // mg/l
 mg / kg                    // mg/kg
 mg.kg-1                    // mg/kg
 mg kg-1                    // mg/kg

diff --git a/main/src/main/resources/org/clulab/numeric/atomic.yml b/main/src/main/resources/org/clulab/numeric/atomic.yml
@@ -14,7 +14,7 @@ rules:
     priority: ${ rulepriority }
     type: token
     pattern: |
-      [entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]*
+      [entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]* (?![tag = /CD/])
 
   # possible years, from 1ddd to 20dd
   - name: year

diff --git a/main/src/main/resources/org/clulab/numeric/date-ranges.yml b/main/src/main/resources/org/clulab/numeric/date-ranges.yml
@@ -184,3 +184,12 @@
   action: mkDateRangeMentionVagueSeason
   pattern: |
     /^(1\d\d\d|2\d\d\d)(WS|DS)$/ | @year:PossibleYear (?<season> [word = /^(WS|DS)$/])
+
+# A year range written as one token: two years (1ddd or 20dd) joined by '-' or '/'
+- name: date-one-token-year-range
+  label: Date
+  priority: ${ rulepriority }
+  type: token
+  example: "The areas sown for this 2021/2022 wintering campaign are. [...] rice yield will increase from 3600 in 2000-2009 [...]"
+  action: mkDateRangeMentionOneTokenYearRange
+  pattern: |
+    /^(1\d\d\d|20\d\d)[\-\/](1\d\d\d|20\d\d)$/
diff --git a/main/src/main/resources/org/clulab/numeric/dates.yml b/main/src/main/resources/org/clulab/numeric/dates.yml
@@ -47,7 +47,7 @@ rules:
     example: "It was May 12 of 2000"
     action: mkDateMention
     pattern: |
-      @month:PossibleMonth ( @day:PossibleDay )? "of"? @year:PossibleYear
+      @month:PossibleMonth ( @day:PossibleDay )? /^(of|in)$/? @year:PossibleYear
 
   # American date format, with mandatory year
   - name: date-4
@@ -76,7 +76,7 @@ rules:
     example: "26 September in 2011WS"
     action: mkDateMention
     pattern: |
-      ( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ (?<year> [word = /^(\d\d\d\d)(WS|DS)$/])
+      ( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ /^the$/? (?<year> [word = /^(1\d\d\d|20\d\d)(WS|DS)$/])
 
   # Rule for YYYY-MM-DD (accepts -, :, / as separators)
   - name: date-yyyy-mm-dd

diff --git a/main/src/main/resources/org/clulab/numeric/measurements.yml b/main/src/main/resources/org/clulab/numeric/measurements.yml
@@ -1,4 +1,5 @@
 rules:
+
   - name: measurement-1
     label: Measurement
     priority: ${ rulepriority }
@@ -15,10 +16,18 @@ rules:
     pattern: |
       @number:NumberRange @unit:MeasurementUnit
 
+  # Several numbers that share one trailing unit, e.g., "6.4, 7.9, and 7.1 t/ha".
+  # The connector regex is anchored so that only the tokens ",", "and", and "to" qualify;
+  # unanchored, it would also accept any word that merely contains one of them.
+  - name: measurement-3
+    label: Measurement
+    priority: ${ rulepriority }
+    type: token
+    action: mkSharedMeasurementMention
+    pattern: |
+      (@number:Number [!tag = /^NN|LRB/]{0,2} [word = /^(,|and|to)$/]*)+ @number:Number @unit:MeasurementUnit
+
   - name: measurement-percentage
     label: Percentage
     priority: ${ rulepriority }
     type: token
     action: mkPercentage
     pattern: |
-      @number:Number [word=/(?i)pct|percent|%/]
+      @number:Number [word=/(?i)pct|percent|%/]
diff --git a/main/src/main/resources/org/clulab/numeric/number-ranges.yml b/main/src/main/resources/org/clulab/numeric/number-ranges.yml
@@ -9,7 +9,7 @@
   example: "Weeding timing ranged from 2 to 17 days"
   action: mkNumberRangeMention
   pattern: |
-    /(?i)(from|between)/ @number1:Number /(?i)(to|and|\-)/ @number2:Number
+    /(?i)(from|between)/ @number1:Number [entity = /MEASUREMENT-UNIT/]* /(?i)(to|and|\-)/ @number2:Number 
 
 - name: number-range-2
   priority: ${rulepriority}

diff --git a/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala b/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala
@@ -42,7 +42,6 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions
   def extractFrom(doc:Document): Seq[Mention] = {
     // dictionaries
     val originalEntities = matchLexiconNer(doc)
-
     // grammars
     var mentions = extractor.extractFrom(doc)
 
@@ -52,9 +51,7 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions
     }
 
     // global actions *after* all grammars are done
-    mentions = actions.cleanupAction(mentions)
-
-    mentions
+    actions.cleanupAction(mentions)
   }
 }
 

diff --git a/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala b/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala
@@ -16,7 +16,24 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions {
     val convertedMentions = new ArrayBuffer[Mention]()
     for(m <- mentions) {
       try {
-        convertedMentions += converter(m)
+        convertedMentions += converter(m )
+      } catch {
+        case e: Exception =>
+          // sometimes these conversions fail, mainly on broken texts
+          // let's be robust here: report the error and move on
+          System.err.println(s"WARNING: $converterName conversion failed! Recovering and continuing...")
+          e.printStackTrace()
+      }
+    }
+    convertedMentions
+  }
+
+  /** Converts a sequence of mentions to new types given the converter function */
+  private def convertWithOneToManyConverter(mentions: Seq[Mention], converter: Mention => Seq[Mention], converterName: String): Seq[Mention] = {
+    val convertedMentions = new ArrayBuffer[Mention]()
+    for(m <- mentions) {
+      try {
+        convertedMentions ++= converter(m )
       } catch {
         case e: Exception =>
           // sometimes these conversions fail, mainly on broken texts
@@ -38,6 +55,11 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions {
     convert(mentions, toMeasurementMention, "toMeasurementMention")
   }
 
+  /** Converts each mention with toSharedMeasurementMention, which may produce several mentions per input */
+  def mkSharedMeasurementMention(mentions: Seq[Mention], state: State): Seq[Mention] =
+    convertWithOneToManyConverter(mentions, toSharedMeasurementMention, "toSharedMeasurementMention")
+
   def mkPercentage(mentions: Seq[Mention], state: State): Seq[Mention] = {
     convert(mentions, toPercentageMention, "toPercentageMention")
   }
@@ -120,6 +142,11 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions {
     convert(mentions, toDateRangeMentionFromVagueSeason, "mkDateRangeMentionVagueSeason")
   }
 
+  /** Converts each mention matched by the one-token year range rule with toDateRangeMentionFromOneTokenYearRange */
+  def mkDateRangeMentionOneTokenYearRange(mentions: Seq[Mention], state: State): Seq[Mention] =
+    convert(mentions, toDateRangeMentionFromOneTokenYearRange, "mkDateRangeMentionOneTokenYearRange")
+
   /** Constructs a DateMention from a token pattern */
   def mkDateMention(mentions: Seq[Mention], state: State): Seq[Mention] = {
     convert(mentions, toDateMention, "toDateMention")

diff --git a/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala b/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala
@@ -30,7 +30,6 @@ class MeasurementMention ( labels: Seq[String],
     if(numValueOpt.isEmpty)
       throw new RuntimeException(s"ERROR: could not parse the number [${value.mkString(" ")}] in the measurement ${raw.mkString(" ")}!")
     val unitNorm = UnitNormalizer.norm(unit.get)
-
     numValueOpt.get + " " + unitNorm
   }
 

diff --git a/main/src/main/scala/org/clulab/numeric/mentions/package.scala b/main/src/main/scala/org/clulab/numeric/mentions/package.scala
@@ -1,6 +1,8 @@
 package org.clulab.numeric
 
-import org.clulab.odin.{Mention, RelationMention, TextBoundMention}
+import org.clulab.odin.{EventMention, Mention, RelationMention, TextBoundMention}
+import org.clulab.struct.Interval
+
 import java.util.regex.Pattern
 
 package object mentions {
@@ -68,6 +70,36 @@ package object mentions {
       throw new RuntimeException(s"ERROR: cannot convert mention of type [${m.getClass.toString}] to MeasurementMention!")
   }
 
+  /**
+    * Splits a measurement in which several numbers share one unit, e.g., "6.4, 7.9, and 7.1 t/ha",
+    * into one MeasurementMention per number.
+    *
+    * @param mention a MeasurementMention (returned unchanged) or a RelationMention with "number"
+    *                arguments and a "unit" argument
+    * @return one MeasurementMention per "number" argument, ordered by token interval
+    * @throws RuntimeException if the mention has any other type
+    */
+  def toSharedMeasurementMention(mention: Mention): Seq[Mention] =  mention match {
+    case m:  MeasurementMention => Seq(m)
+
+    case m: RelationMention =>
+      m.arguments("number").sortBy(_.tokenInterval).map { number =>
+        val unit = m.arguments("unit").head
+        val ordered = Seq(unit, number).sortBy(_.tokenInterval)
+        val first = ordered.head.tokenInterval
+        val second = ordered.last.tokenInterval
+        // Token intervals are end-exclusive, so the number and the unit are adjacent when the
+        // second one starts exactly where the first one ends. Only in that case does the new
+        // mention span both; otherwise it covers just the number.
+        val tokenInterval =
+          if (second.start == first.end) Interval(first.start, second.end)
+          else number.tokenInterval
+        new MeasurementMention(
+          m.labels,
+          tokenInterval,
+          m.sentence,
+          m.document,
+          m.keep,
+          m.foundBy,
+          m.attachments,
+          Some(number.words),
+          getArgWords("unit", m),
+          false
+        )
+      }
+
+    case m =>
+      throw new RuntimeException(s"ERROR: cannot convert mention of type [${m.getClass.toString}] to MeasurementMention!")
+  }
+
+
   def toPercentageMention(mention: Mention): PercentageMention =  mention match {
     case m:  PercentageMention => m
 
@@ -470,6 +502,22 @@ package object mentions {
       throw new RuntimeException(s"ERROR: cannot convert mention of type ${m.getClass.toString} to DateRangeMention!")
   }
 
+  /** Builds a DateRangeMention from a single token holding two years separated by '-' or '/', e.g., 2020/2021 and 2020-2021 */
+  def toDateRangeMentionFromOneTokenYearRange(mention: Mention): DateRangeMention =  mention match {
+    case range: DateRangeMention => range
+
+    case textBound: TextBoundMention =>
+      // the first piece is the start year and the last piece is the end year
+      val yearTokens = textBound.text.split("[-\\/]")
+      val startDate = TempEvalFormatter.mkDate(None, None, Some(Seq(yearTokens.head)))
+      val endDate = TempEvalFormatter.mkDate(None, None, Some(Seq(yearTokens.last)))
+      DateRangeMention(textBound, startDate, endDate)
+
+    case m =>
+      throw new RuntimeException(s"ERROR: cannot convert mention of type ${m.getClass.toString} to DateRangeMention!")
+  }
+
   def toDateMention(mention: Mention): DateMention =  mention match {
     case m: DateMention => m
 

diff --git a/main/src/main/scala/org/clulab/numeric/package.scala b/main/src/main/scala/org/clulab/numeric/package.scala
@@ -5,6 +5,7 @@ import org.clulab.numeric.mentions.{DateMention, DateRangeMention, MeasurementMe
 import org.clulab.odin.{EventMention, Mention}
 import org.clulab.processors.{Document, Sentence}
 import org.clulab.struct.Interval
+import scala.util.control.Breaks._
 
 package object numeric {
   def displayMentions(mentions: Seq[Mention], doc: Document): Unit = {
@@ -92,6 +93,29 @@ package object numeric {
         addLabelsAndNorms(mention.asInstanceOf[Norm], mention.sentenceObj, mention.tokenInterval)
       }
     }
+    removeOneEntityBeforeAnother(doc, "B-LOC", "MEASUREMENT")
+  }
+
+  /**
+    * Removes entities and norms for unallowable entity sequences, e.g., don't extract 'in' as 'inch'
+    * before B-LOC in '... Sahel 108 in Senegal'.
+    *
+    * Whenever a token labeled triggerEntity directly follows a token whose label ends with
+    * toBeRemovedShortened, the labels of the immediately preceding mention are reset to "O" and
+    * its norms (when the sentence has norms) to "". Sentences without entities are skipped.
+    *
+    * @param doc the document whose sentences are modified in place
+    * @param triggerEntity the full label, including the BIO prefix, e.g., "B-LOC"
+    * @param toBeRemovedShortened the label to remove, without the BIO prefix, e.g., "MEASUREMENT"
+    */
+  def removeOneEntityBeforeAnother(doc: Document, triggerEntity: String, toBeRemovedShortened: String): Unit = {
+    doc.sentences.foreach { s =>
+      s.entities.foreach { entities =>
+        for (i <- 1 until entities.length) {
+          if (entities(i) == triggerEntity && entities(i - 1).endsWith(toBeRemovedShortened)) {
+            // go in reverse through the immediately preceding mention only: stop once its
+            // B- token was cleared so that an earlier, adjacent mention is left alone
+            var j = i - 1
+            var reachedStart = false
+            while (j >= 0 && !reachedStart && entities(j).endsWith(toBeRemovedShortened)) {
+              reachedStart = entities(j).startsWith("B-")
+              entities(j) = "O"
+              s.norms.foreach(norms => norms(j) = "")
+              j -= 1
+            }
+          }
+        }
+      }
+    }
+  }
 
   private def addLabelsAndNorms(m: Norm, s: Sentence, tokenInt: Interval): Unit = {

diff --git a/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala b/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala
@@ -667,6 +667,7 @@ class CluProcessor protected (
     // numeric entities using our Odin rules
     //
     val numericMentions = numericEntityRecognizer.extractFrom(doc)
+
     setLabelsAndNorms(doc, numericMentions)
   }
 
@@ -772,7 +773,6 @@ class CluProcessor protected (
     for(sent <- doc.sentences) {
       val headsWithLabels = parseSentenceWithEisner(sent.words, sent.tags.get, sent.entities.get, embeddings)
       parserPostProcessing(sent, headsWithLabels)
-      //println("headsWithLabels: " + headsWithLabels.mkString(" "))
 
       val edges = new ListBuffer[Edge[String]]()
       val roots = new mutable.HashSet[Int]()

diff --git a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala
@@ -153,6 +153,11 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
     ensure(sentence= "19:02.", Interval(0, 1), goldEntity= "DATE", goldNorm= "XX19-02-XX")
   }
 
+  it should "recognize one token year ranges" in {
+    // each single-token range paired with its expected normalization
+    val expectedNorms = Seq(
+      "2021/2022" -> "2021-XX-XX -- 2022-XX-XX",
+      "2000-2009" -> "2000-XX-XX -- 2009-XX-XX"
+    )
+    for ((text, norm) <- expectedNorms)
+      ensure(sentence= text, Interval(0, 1), goldEntity= "DATE-RANGE", goldNorm= norm)
+  }
+
   it should "recognize numeric dates of form month of year" in {
     ensure(sentence= "sowing date is best in May of 2020", Interval(5, 8), goldEntity= "DATE", goldNorm= "2020-05-XX")
     ensure(sentence= "sowing date in July of 2020", Interval(3, 6), goldEntity= "DATE", goldNorm= "2020-07-XX")
@@ -170,6 +175,11 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
     ensure(sentence= "February 21 of 1002", Interval(0, 4), goldEntity= "DATE", goldNorm= "1002-02-21")
   }
 
+  it should "recognize numeric dates of form month date in year" in {
+    // both dates come from the same sentence
+    val text = "The first sowing dates started on July 1st in 2010 and on July 8th in 2011"
+    ensure(text, Interval(6, 10), "DATE", "2010-07-01")
+    ensure(text, Interval(12, 16), "DATE", "2011-07-08")
+  }
+
   it should "recognize dates with ordinal days" in {
     ensure(sentence = "Planting dates are between July 1st and August 2nd.", Interval(3, 9), goldEntity = "DATE-RANGE", "XXXX-07-01 -- XXXX-08-02")
   }
@@ -322,6 +332,12 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
   it should "recognize date ranges with vague seasons" in {
     ensure("Seeding dates ranged from 22 August to 26 September in 2011WS.",
       Interval(3, 11), "DATE-RANGE", "2011-08-22 -- 2011-09-26")
+    ensure("The planned timing for the first split was 23 days after sowing (from 3 to 13 August in the 1999WS and from 14 to 25 August in the 2000WS",
+      Interval(13, 21), "DATE-RANGE", "1999-08-03 -- 1999-08-13"
+    )
+    ensure("The planned timing for the first split was 23 days after sowing (from 3 to 13 August in the 1999WS and from 14 to 25 August in the 2000WS",
+      Interval(22, 29), "DATE-RANGE", "2000-08-14 -- 2000-08-25"
+    )
   }
 
   it should "recognize date ranges (month/day) with vague seasons" in {
@@ -438,6 +454,23 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
 
     // TODO: not sure what should be the output of such measurement '3 or 4 days'
     ensure(sentence= "and lasted 3 or 4 days in both wet seasons", Interval(4, 6), goldEntity="MEASUREMENT", goldNorm="4.0 d")
+    ensure(sentence= "ranged from 2.7 t ha-1 to 7.1 t ha-1", Interval(1, 9), goldEntity="MEASUREMENT", goldNorm="2.7 -- 7.1 t/ha")
+    ensure(sentence= "yields were between 8.8 t ha-1 and 9.2 t ha-1", Interval(2, 10), goldEntity="MEASUREMENT", goldNorm="8.8 -- 9.2 t/ha")
+  }
+
+  it should "recognize shared units" in {
+    // three numbers that share one trailing unit
+    val targetYields = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively."
+    ensure(sentence = targetYields, Interval(7,8), goldEntity="MEASUREMENT", goldNorm="6.4 t/ha")
+    ensure(sentence = targetYields, Interval(9,10), goldEntity="MEASUREMENT", goldNorm="7.9 t/ha")
+    ensure(sentence = targetYields, Interval(12,13), goldEntity="MEASUREMENT", goldNorm="7.1 t/ha")
+
+    // two numbers joined by "and" that share one unit
+    val estimate = "was estimated at 9 and 10 t / ha"
+    ensure(sentence = estimate, Interval(3, 4), goldEntity="MEASUREMENT", goldNorm="9.0 t/ha")
+    ensure(sentence = estimate, Interval(5, 9), goldEntity="MEASUREMENT", goldNorm="10.0 t/ha")
+
+    ensure(sentence = "+ 100 kg ha-1 urea at 20 das + 50 kg ha-1 urea at 50 das", Interval(6, 8), goldEntity="O", goldNorm="")
+    ensure(sentence = "yield will increase from 3600 in 2000-2009 to 4500 kg ha-1 in 2090-2099", Interval(4, 5), goldEntity="MEASUREMENT", goldNorm="3600.0 kg/ha")
+  }
+
+  it should "not recognize preposition `in` as `inch`" in {
+    // in both sentences tokens 3 and 4 ("108 in") must carry no norm
+    val sentences = Seq(
+      "released as Sahel 108 in Senegal in 1994",
+      "92% grew Sahel 108 in 2012DS"
+    )
+    for (text <- sentences)
+      ensure(sentence = text, Interval(3,5), goldEntity="O", goldNorm="")
+  }
 
   // TODO: this requires non trivial changes to the tokenizer
@@ -562,16 +595,21 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
     println("Entities: " + entities.mkString(", "))
     println("Norms:    " + norms.mkString(", "))
 
-    if(goldEntity.nonEmpty) {
+    if (goldEntity.nonEmpty) {
       var first = true
       for (i <- span.indices) {
-        val prefix = if (first) "B-" else "I-"
-        val label = prefix + goldEntity
+        if (goldEntity == "O") {
+          norms(i) should be(goldNorm)
+        } else {
+          val prefix = if (first) "B-" else "I-"
+          val label = prefix + goldEntity
+
+          entities(i) should be(label)
+          norms(i) should be(goldNorm)
 
-        entities(i) should be(label)
-        norms(i) should be(goldNorm)
+          first = false
+        }
 
-        first = false
       }
     }
   }