diff --git a/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv b/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv index edd067628..21012c9f9 100644 --- a/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv +++ b/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv @@ -174,6 +174,7 @@ kgP2O5/ha // kg/ha Mg ha-1 // Mg/ha mg / l // mg/l mg.l-1 // mg/l +mg l-1 // mg/l mg / kg // mg/kg mg.kg-1 // mg/kg mg kg-1 // mg/kg diff --git a/main/src/main/resources/org/clulab/numeric/atomic.yml b/main/src/main/resources/org/clulab/numeric/atomic.yml index b3d740acd..b508665c9 100644 --- a/main/src/main/resources/org/clulab/numeric/atomic.yml +++ b/main/src/main/resources/org/clulab/numeric/atomic.yml @@ -14,7 +14,7 @@ rules: priority: ${ rulepriority } type: token pattern: | - [entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]* + [entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]* (?![tag = /CD/]) # possible years, from 1ddd to 20dd - name: year diff --git a/main/src/main/resources/org/clulab/numeric/date-ranges.yml b/main/src/main/resources/org/clulab/numeric/date-ranges.yml index 2939726cd..f8f67942d 100644 --- a/main/src/main/resources/org/clulab/numeric/date-ranges.yml +++ b/main/src/main/resources/org/clulab/numeric/date-ranges.yml @@ -184,3 +184,12 @@ action: mkDateRangeMentionVagueSeason pattern: | /^(1\d\d\d|2\d\d\d)(WS|DS)$/ | @year:PossibleYear (? [word = /^(WS|DS)$/]) + +- name: date-one-token-year-range + label: Date + priority: ${ rulepriority } + type: token + example: "The areas sown for this 2021/2022 wintering campaign are. [...] rice yield will increase from 3600 in 2000-2009 [...]" + action: mkDateRangeMentionOneTokenYearRange + pattern: | + /^(1\d\d\d|20\d\d)[\-\/](1\d\d\d|20\d\d)$/ \ No newline at end of file diff --git a/main/src/main/resources/org/clulab/numeric/dates.yml b/main/src/main/resources/org/clulab/numeric/dates.yml index 0256755e5..081ccc220 100644 --- a/main/src/main/resources/org/clulab/numeric/dates.yml +++ b/main/src/main/resources/org/clulab/numeric/dates.yml @@ -47,7 +47,7 @@ rules: example: "It was May 12 of 2000" action: mkDateMention pattern: | - @month:PossibleMonth ( @day:PossibleDay )? "of"? @year:PossibleYear + @month:PossibleMonth ( @day:PossibleDay )? /^(of|in)$/? @year:PossibleYear # American date format, with mandatory year - name: date-4 @@ -76,7 +76,7 @@ rules: example: "26 September in 2011WS" action: mkDateMention pattern: | - ( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ (? [word = /^(\d\d\d\d)(WS|DS)$/]) + ( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ /^the$/? (? [word = /^(1\d\d\d|20\d\d)(WS|DS)$/]) # Rule for YYYY-MM-DD (accepts -, :, / as separators) - name: date-yyyy-mm-dd diff --git a/main/src/main/resources/org/clulab/numeric/measurements.yml b/main/src/main/resources/org/clulab/numeric/measurements.yml index 9a6852bea..4f9316577 100644 --- a/main/src/main/resources/org/clulab/numeric/measurements.yml +++ b/main/src/main/resources/org/clulab/numeric/measurements.yml @@ -1,4 +1,5 @@ rules: + - name: measurement-1 label: Measurement priority: ${ rulepriority } @@ -15,10 +16,18 @@ rules: pattern: | @number:NumberRange @unit:MeasurementUnit + - name: measurement-3 + label: Measurement + priority: ${ rulepriority } + type: token + action: mkSharedMeasurementMention + pattern: | + (@number:Number [!tag = /^NN|LRB/]{0,2} [word = /,|and|to/]*)+ @number:Number @unit:MeasurementUnit + - name: measurement-percentage label: Percentage priority: ${ rulepriority } type: token action: mkPercentage pattern: | - @number:Number [word=/(?i)pct|percent|%/] + @number:Number [word=/(?i)pct|percent|%/] \ No newline at end of file diff --git a/main/src/main/resources/org/clulab/numeric/number-ranges.yml b/main/src/main/resources/org/clulab/numeric/number-ranges.yml index f7156c3e3..13117ca4d 100644 --- a/main/src/main/resources/org/clulab/numeric/number-ranges.yml +++ b/main/src/main/resources/org/clulab/numeric/number-ranges.yml @@ -9,7 +9,7 @@ example: "Weeding timing ranged from 2 to 17 days" action: mkNumberRangeMention pattern: | - /(?i)(from|between)/ @number1:Number /(?i)(to|and|\-)/ @number2:Number + /(?i)(from|between)/ @number1:Number [entity = /MEASUREMENT-UNIT/]* /(?i)(to|and|\-)/ @number2:Number - name: number-range-2 priority: ${rulepriority} diff --git a/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala b/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala index 96fede26a..fd9fbd6fc 100644 --- a/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala +++ b/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala @@ -42,7 +42,6 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions def extractFrom(doc:Document): Seq[Mention] = { // dictionaries val originalEntities = matchLexiconNer(doc) - // grammars var mentions = extractor.extractFrom(doc) @@ -52,9 +51,7 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions } // global actions *after* all grammars are done - mentions = actions.cleanupAction(mentions) - - mentions + actions.cleanupAction(mentions) } } diff --git a/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala b/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala index 143d410e1..52b896934 100644 --- a/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala +++ b/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala @@ -16,7 +16,24 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions { val convertedMentions = new ArrayBuffer[Mention]() for(m <- mentions) { try { - convertedMentions += converter(m) + convertedMentions += converter(m ) + } catch { + case e: Exception => + // sometimes these conversions fail, mainly on broken texts + // let's be robust here: report the error and move on + System.err.println(s"WARNING: $converterName conversion failed! Recovering and continuing...") + e.printStackTrace() + } + } + convertedMentions + } + + /** Converts a sequence of mentions to new types given the converter function */ + private def convertWithOneToManyConverter(mentions: Seq[Mention], converter: Mention => Seq[Mention], converterName: String): Seq[Mention] = { + val convertedMentions = new ArrayBuffer[Mention]() + for(m <- mentions) { + try { + convertedMentions ++= converter(m ) } catch { case e: Exception => // sometimes these conversions fail, mainly on broken texts @@ -38,6 +55,11 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions { convert(mentions, toMeasurementMention, "toMeasurementMention") } + /** Constructs a MeasurementMention from a token pattern */ + def mkSharedMeasurementMention(mentions: Seq[Mention], state: State): Seq[Mention] = { + convertWithOneToManyConverter(mentions, toSharedMeasurementMention, "toSharedMeasurementMention") + } + def mkPercentage(mentions: Seq[Mention], state: State): Seq[Mention] = { convert(mentions, toPercentageMention, "toPercentageMention") } @@ -120,6 +142,11 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions { convert(mentions, toDateRangeMentionFromVagueSeason, "mkDateRangeMentionVagueSeason") } + /** Constructs a DateRangeMention from a token pattern */ + def mkDateRangeMentionOneTokenYearRange(mentions: Seq[Mention], state: State): Seq[Mention] = { + convert(mentions, toDateRangeMentionFromOneTokenYearRange, "mkDateRangeMentionOneTokenYearRange") + } + /** Constructs a DateMention from a token pattern */ def mkDateMention(mentions: Seq[Mention], state: State): Seq[Mention] = { convert(mentions, toDateMention, "toDateMention") diff --git a/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala b/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala index f93d75b63..fab60eceb 100644 --- a/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala +++ b/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala @@ -30,7 +30,6 @@ class MeasurementMention ( labels: Seq[String], if(numValueOpt.isEmpty) throw new RuntimeException(s"ERROR: could not parse the number [${value.mkString(" ")}] in the measurement ${raw.mkString(" ")}!") val unitNorm = UnitNormalizer.norm(unit.get) - numValueOpt.get + " " + unitNorm } diff --git a/main/src/main/scala/org/clulab/numeric/mentions/package.scala b/main/src/main/scala/org/clulab/numeric/mentions/package.scala index 3d4e89370..3a921165c 100644 --- a/main/src/main/scala/org/clulab/numeric/mentions/package.scala +++ b/main/src/main/scala/org/clulab/numeric/mentions/package.scala @@ -1,6 +1,8 @@ package org.clulab.numeric -import org.clulab.odin.{Mention, RelationMention, TextBoundMention} +import org.clulab.odin.{EventMention, Mention, RelationMention, TextBoundMention} +import org.clulab.struct.Interval + import java.util.regex.Pattern package object mentions { @@ -68,6 +70,36 @@ package object mentions { throw new RuntimeException(s"ERROR: cannot convert mention of type [${m.getClass.toString}] to MeasurementMention!") } + def toSharedMeasurementMention(mention: Mention): Seq[Mention] = mention match { + case m: MeasurementMention => Seq(m) + + case m: RelationMention => + mention.arguments("number").sortBy(_.tokenInterval).map { a => + val newArgs = Seq(m.arguments("unit").head, a).sortBy(_.tokenInterval) + // if num and unit are adjacent, include both in new token int, else use the token int of the number arg + val newTokInt = if (newArgs.last.start - newArgs.head.end == 1) { + Interval(newArgs.head.tokenInterval.start, newArgs.last.tokenInterval.end) + } else a.tokenInterval + new MeasurementMention( + m.labels, + a.tokenInterval, + m.sentence, + m.document, + m.keep, + m.foundBy, + m.attachments, + Some(a.words), + getArgWords("unit", m), + false + ) + } + + + case m => + throw new RuntimeException(s"ERROR: cannot convert mention of type [${m.getClass.toString}] to MeasurementMention!") + } + + def toPercentageMention(mention: Mention): PercentageMention = mention match { case m: PercentageMention => m @@ -470,6 +502,22 @@ package object mentions { throw new RuntimeException(s"ERROR: cannot convert mention of type ${m.getClass.toString} to DateRangeMention!") } + /** handles one token year ranges, e.g., 2020/2021 and 2020-2021 */ + def toDateRangeMentionFromOneTokenYearRange(mention: Mention): DateRangeMention = mention match { + case m: DateRangeMention => m + + case m: TextBoundMention => + val years = m.text.split("[-\\/]") + DateRangeMention( + m, + TempEvalFormatter.mkDate(None, None, Some(Seq(years.head))), + TempEvalFormatter.mkDate(None, None, Some(Seq(years.last))) + ) + + case m => + throw new RuntimeException(s"ERROR: cannot convert mention of type ${m.getClass.toString} to DateRangeMention!") + } + def toDateMention(mention: Mention): DateMention = mention match { case m: DateMention => m diff --git a/main/src/main/scala/org/clulab/numeric/package.scala b/main/src/main/scala/org/clulab/numeric/package.scala index 016e27939..a5845342d 100644 --- a/main/src/main/scala/org/clulab/numeric/package.scala +++ b/main/src/main/scala/org/clulab/numeric/package.scala @@ -5,6 +5,7 @@ import org.clulab.numeric.mentions.{DateMention, DateRangeMention, MeasurementMe import org.clulab.odin.{EventMention, Mention} import org.clulab.processors.{Document, Sentence} import org.clulab.struct.Interval +import scala.util.control.Breaks._ package object numeric { def displayMentions(mentions: Seq[Mention], doc: Document): Unit = { @@ -92,6 +93,29 @@ package object numeric { addLabelsAndNorms(mention.asInstanceOf[Norm], mention.sentenceObj, mention.tokenInterval) } } + removeOneEntityBeforeAnother(doc, "B-LOC", "MEASUREMENT") + } + + def removeOneEntityBeforeAnother(doc: Document, triggerEntity: String, toBeRemovedShortened: String): Unit = { + // removes entities and norms for unallowable entity sequences, e.g., don't extract 'in' as 'inch' before B-LOC in '... Sahal 108 in Senegal' + // toBeRemovedShortened is entity without BIO- + for(s <- doc.sentences) { + val zippedEntities = s.entities.get.zipWithIndex + for ((e, i) <- zippedEntities) { + if (i > 0 && e == triggerEntity && s.entities.get(i-1).endsWith(toBeRemovedShortened)) { + s.entities.get(i - 1) = "O" + // go in reverse replacing indices and norms in the immediate preceding mention + breakable { + for ((en, j) <- zippedEntities.slice(0, i ).reverse) { + if (en.endsWith(toBeRemovedShortened)) { + s.entities.get(j) = "O" + s.norms.get(j) = "" + } else break + } + } + } + } + } } private def addLabelsAndNorms(m: Norm, s: Sentence, tokenInt: Interval): Unit = { diff --git a/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala b/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala index d580f846e..15e99bc77 100644 --- a/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala +++ b/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala @@ -667,6 +667,7 @@ class CluProcessor protected ( // numeric entities using our Odin rules // val numericMentions = numericEntityRecognizer.extractFrom(doc) + setLabelsAndNorms(doc, numericMentions) } @@ -772,7 +773,6 @@ class CluProcessor protected ( for(sent <- doc.sentences) { val headsWithLabels = parseSentenceWithEisner(sent.words, sent.tags.get, sent.entities.get, embeddings) parserPostProcessing(sent, headsWithLabels) - //println("headsWithLabels: " + headsWithLabels.mkString(" ")) val edges = new ListBuffer[Edge[String]]() val roots = new mutable.HashSet[Int]() diff --git a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala index 09ad4a02e..bc3151102 100644 --- a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala +++ b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala @@ -153,6 +153,11 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { ensure(sentence= "19:02.", Interval(0, 1), goldEntity= "DATE", goldNorm= "XX19-02-XX") } + it should "recognize one token year ranges" in { + ensure(sentence= "2021/2022", Interval(0, 1), goldEntity= "DATE-RANGE", goldNorm= "2021-XX-XX -- 2022-XX-XX") + ensure(sentence= "2000-2009", Interval(0, 1), goldEntity= "DATE-RANGE", goldNorm= "2000-XX-XX -- 2009-XX-XX") + } + it should "recognize numeric dates of form month of year" in { ensure(sentence= "sowing date is best in May of 2020", Interval(5, 8), goldEntity= "DATE", goldNorm= "2020-05-XX") ensure(sentence= "sowing date in July of 2020", Interval(3, 6), goldEntity= "DATE", goldNorm= "2020-07-XX") @@ -170,6 +175,11 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { ensure(sentence= "February 21 of 1002", Interval(0, 4), goldEntity= "DATE", goldNorm= "1002-02-21") } + it should "recognize numeric dates of form month date in year" in { + ensure("The first sowing dates started on July 1st in 2010 and on July 8th in 2011", Interval(6, 10), "DATE", "2010-07-01") + ensure("The first sowing dates started on July 1st in 2010 and on July 8th in 2011", Interval(12, 16), "DATE", "2011-07-08") + } + it should "recognize dates with ordinal days" in { ensure(sentence = "Planting dates are between July 1st and August 2nd.", Interval(3, 9), goldEntity = "DATE-RANGE", "XXXX-07-01 -- XXXX-08-02") } @@ -322,6 +332,12 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { it should "recognize date ranges with vague seasons" in { ensure("Seeding dates ranged from 22 August to 26 September in 2011WS.", Interval(3, 11), "DATE-RANGE", "2011-08-22 -- 2011-09-26") + ensure("The planned timing for the first split was 23 days after sowing (from 3 to 13 August in the 1999WS and from 14 to 25 August in the 2000WS", + Interval(13, 21), "DATE-RANGE", "1999-08-03 -- 1999-08-13" + ) + ensure("The planned timing for the first split was 23 days after sowing (from 3 to 13 August in the 1999WS and from 14 to 25 August in the 2000WS", + Interval(22, 29), "DATE-RANGE", "2000-08-14 -- 2000-08-25" + ) } it should "recognize date ranges (month/day) with vague seasons" in { @@ -438,6 +454,23 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { // TODO: not sure what should be the output of such measurement '3 or 4 days' ensure(sentence= "and lasted 3 or 4 days in both wet seasons", Interval(4, 6), goldEntity="MEASUREMENT", goldNorm="4.0 d") + ensure(sentence= "ranged from 2.7 t ha-1 to 7.1 t ha-1", Interval(1, 9), goldEntity="MEASUREMENT", goldNorm="2.7 -- 7.1 t/ha") + ensure(sentence= "yields were between 8.8 t ha-1 and 9.2 t ha-1", Interval(2, 10), goldEntity="MEASUREMENT", goldNorm="8.8 -- 9.2 t/ha") + } + + it should "recognize shared units" in { + ensure(sentence = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively.", Interval(7,8), goldEntity="MEASUREMENT", goldNorm="6.4 t/ha") + ensure(sentence = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively.", Interval(9,10), goldEntity="MEASUREMENT", goldNorm="7.9 t/ha") + ensure(sentence = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively.", Interval(12,13), goldEntity="MEASUREMENT", goldNorm="7.1 t/ha") + ensure(sentence = "was estimated at 9 and 10 t / ha", Interval(3, 4), goldEntity="MEASUREMENT", goldNorm="9.0 t/ha") + ensure(sentence = "was estimated at 9 and 10 t / ha", Interval(5, 9), goldEntity="MEASUREMENT", goldNorm="10.0 t/ha") + ensure(sentence = "+ 100 kg ha-1 urea at 20 das + 50 kg ha-1 urea at 50 das", Interval(6, 8), goldEntity="O", goldNorm="") + ensure(sentence = "yield will increase from 3600 in 2000-2009 to 4500 kg ha-1 in 2090-2099", Interval(4, 5), goldEntity="MEASUREMENT", goldNorm="3600.0 kg/ha") + } + + it should "not recognize preposition `in` as `inch`" in { + ensure(sentence = "released as Sahel 108 in Senegal in 1994", Interval(3,5), goldEntity="O", goldNorm="") + ensure(sentence = "92% grew Sahel 108 in 2012DS", Interval(3,5), goldEntity="O", goldNorm="") } // TODO: this requires non trivial changes to the tokenizer @@ -562,16 +595,21 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { println("Entities: " + entities.mkString(", ")) println("Norms: " + norms.mkString(", ")) - if(goldEntity.nonEmpty) { + if (goldEntity.nonEmpty) { var first = true for (i <- span.indices) { - val prefix = if (first) "B-" else "I-" - val label = prefix + goldEntity + if (goldEntity == "O") { + norms(i) should be(goldNorm) + } else { + val prefix = if (first) "B-" else "I-" + val label = prefix + goldEntity + + entities(i) should be(label) + norms(i) should be(goldNorm) - entities(i) should be(label) - norms(i) should be(goldNorm) + first = false + } - first = false } } }