From 2a1767bfea584127ed1f9c0f8bbb4529fe34217b Mon Sep 17 00:00:00 2001 From: Maria Alexeeva Date: Mon, 20 Jun 2022 13:53:57 -0700 Subject: [PATCH 01/11] saving intermediate progress on more numeric normalization --- .../resources/org/clulab/numeric/atomic.yml | 2 +- .../resources/org/clulab/numeric/dates.yml | 2 +- .../org/clulab/numeric/measurements.yml | 11 +++++- .../org/clulab/numeric/number-ranges.yml | 2 +- .../numeric/actions/NumericActions.scala | 26 ++++++++++++- .../org/clulab/numeric/mentions/package.scala | 38 ++++++++++++++++++- .../org/clulab/odin/ExtractorEngine.scala | 1 + .../TestNumericEntityRecognition.scala | 6 +++ 8 files changed, 82 insertions(+), 6 deletions(-) diff --git a/main/src/main/resources/org/clulab/numeric/atomic.yml b/main/src/main/resources/org/clulab/numeric/atomic.yml index b3d740acd..3f80c95c1 100644 --- a/main/src/main/resources/org/clulab/numeric/atomic.yml +++ b/main/src/main/resources/org/clulab/numeric/atomic.yml @@ -14,7 +14,7 @@ rules: priority: ${ rulepriority } type: token pattern: | - [entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]* + [entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]* (?![tag = /NNP|CD/]) # possible years, from 1ddd to 20dd - name: year diff --git a/main/src/main/resources/org/clulab/numeric/dates.yml b/main/src/main/resources/org/clulab/numeric/dates.yml index 0256755e5..5693232dd 100644 --- a/main/src/main/resources/org/clulab/numeric/dates.yml +++ b/main/src/main/resources/org/clulab/numeric/dates.yml @@ -76,7 +76,7 @@ rules: example: "26 September in 2011WS" action: mkDateMention pattern: | - ( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ (? [word = /^(\d\d\d\d)(WS|DS)$/]) + ( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ /^the$/? (? [word = /^(\d\d\d\d)(WS|DS)$/]) # Rule for YYYY-MM-DD (accepts -, :, / as separators) - name: date-yyyy-mm-dd diff --git a/main/src/main/resources/org/clulab/numeric/measurements.yml b/main/src/main/resources/org/clulab/numeric/measurements.yml index 9a6852bea..f6fc374f2 100644 --- a/main/src/main/resources/org/clulab/numeric/measurements.yml +++ b/main/src/main/resources/org/clulab/numeric/measurements.yml @@ -1,11 +1,12 @@ rules: + - name: measurement-1 label: Measurement priority: ${ rulepriority } type: token action: mkMeasurementMention pattern: | - @number:Number @unit:MeasurementUnit + @number:Number @unit:MeasurementUnit - name: measurement-2 label: Measurement @@ -15,6 +16,14 @@ rules: pattern: | @number:NumberRange @unit:MeasurementUnit + - name: measurement-3 + label: Measurement + priority: ${ rulepriority } + type: token + action: mkSharedMeasurementMention + pattern: | + (@number:Number [word = /,|and/]*)+ @number:Number @unit:MeasurementUnit + - name: measurement-percentage label: Percentage priority: ${ rulepriority } diff --git a/main/src/main/resources/org/clulab/numeric/number-ranges.yml b/main/src/main/resources/org/clulab/numeric/number-ranges.yml index f7156c3e3..13117ca4d 100644 --- a/main/src/main/resources/org/clulab/numeric/number-ranges.yml +++ b/main/src/main/resources/org/clulab/numeric/number-ranges.yml @@ -9,7 +9,7 @@ example: "Weeding timing ranged from 2 to 17 days" action: mkNumberRangeMention pattern: | - /(?i)(from|between)/ @number1:Number /(?i)(to|and|\-)/ @number2:Number + /(?i)(from|between)/ @number1:Number [entity = /MEASUREMENT-UNIT/]* /(?i)(to|and|\-)/ @number2:Number - name: number-range-2 priority: ${rulepriority} diff --git a/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala b/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala index 143d410e1..fefa64f94 100644 --- a/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala +++ b/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala @@ -16,7 +16,24 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions { val convertedMentions = new ArrayBuffer[Mention]() for(m <- mentions) { try { - convertedMentions += converter(m) + convertedMentions += converter(m ) + } catch { + case e: Exception => + // sometimes these conversions fail, mainly on broken texts + // let's be robust here: report the error and move on + System.err.println(s"WARNING: $converterName conversion failed! Recovering and continuing...") + e.printStackTrace() + } + } + convertedMentions + } + + /** Converts a sequence of mentions to new types given the converter function */ + private def convertOneToMany(mentions: Seq[Mention], converter: Mention => Seq[Mention], converterName: String): Seq[Mention] = { + val convertedMentions = new ArrayBuffer[Mention]() + for(m <- mentions) { + try { + convertedMentions ++= converter(m ) } catch { case e: Exception => // sometimes these conversions fail, mainly on broken texts @@ -38,6 +55,13 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions { convert(mentions, toMeasurementMention, "toMeasurementMention") } + /** Constructs a MeasurementMention from a token pattern */ + def mkSharedMeasurementMention(mentions: Seq[Mention], state: State): Seq[Mention] = { + val mens = convertOneToMany(mentions, toSharedMeasurementMention, "toSharedMeasurementMention") + mens.foreach(m => println("MEN: " + m.text)) + mens + } + def mkPercentage(mentions: Seq[Mention], state: State): Seq[Mention] = { convert(mentions, toPercentageMention, "toPercentageMention") } diff --git a/main/src/main/scala/org/clulab/numeric/mentions/package.scala b/main/src/main/scala/org/clulab/numeric/mentions/package.scala index 3d4e89370..d8ec9742d 100644 --- a/main/src/main/scala/org/clulab/numeric/mentions/package.scala +++ b/main/src/main/scala/org/clulab/numeric/mentions/package.scala @@ -1,6 +1,8 @@ package org.clulab.numeric -import org.clulab.odin.{Mention, RelationMention, TextBoundMention} +import org.clulab.odin.{EventMention, Mention, RelationMention, TextBoundMention} +import org.clulab.struct.Interval + import java.util.regex.Pattern package object mentions { @@ -68,6 +70,40 @@ package object mentions { throw new RuntimeException(s"ERROR: cannot convert mention of type [${m.getClass.toString}] to MeasurementMention!") } + def toSharedMeasurementMention(mention: Mention): Seq[Mention] = mention match { + case m: MeasurementMention => Seq(m) + + case m: RelationMention => +// if (mention.arguments("number").length > 1) +// println("M: " + mention.text + " " + mention.label) + mention.arguments("number").sortBy(_.tokenInterval).map { a => +// println("num: " + a.words.mkString("|")) +// println("unit: " + m.arguments("unit").head.text) +// println("unit: " + getArgWords("unit", m)) + val newArgs = Seq(m.arguments("unit").head, a).sortBy(_.tokenInterval) + val newTokInt = Interval(newArgs.head.tokenInterval.start, newArgs.last.tokenInterval.end) +// println(">>" + newArgs.map(_.text).mkString("::")) +// println(">>>" + newTokInt + "\n") + new MeasurementMention( + m.labels, + a.tokenInterval, + m.sentence, + m.document, + m.keep, + m.foundBy, + m.attachments, + Some(a.words), + getArgWords("unit", m), + false + ) + } + + + case m => + throw new RuntimeException(s"ERROR: cannot convert mention of type [${m.getClass.toString}] to MeasurementMention!") + } + + def toPercentageMention(mention: Mention): PercentageMention = mention match { case m: PercentageMention => m diff --git a/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala b/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala index cb687b2cd..26bbdd941 100644 --- a/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala +++ b/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala @@ -49,6 +49,7 @@ class ExtractorEngine(val extractors: Vector[Extractor], val globalAction: Actio mention <- globalAction(extractedMentions, state) if mention.isValid && !state.contains(mention) } yield mention +// for (m <- finalMentions) println("men: " + m.label + " " + m.text + " " + m.foundBy + " " + m.arguments.flatMap(_._2).toList.length) // return the final mentions finalMentions } diff --git a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala index ccdc09466..18ed1d9ab 100644 --- a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala +++ b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala @@ -440,6 +440,12 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { ensure(sentence= "and lasted 3 or 4 days in both wet seasons", Interval(4, 6), goldEntity="MEASUREMENT", goldNorm="4.0 d") } + it should "recognize shared units" in { + ensure(sentence = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively.", Interval(7,8), goldEntity="MEASUREMENT", goldNorm="6.4 t/ha") + ensure(sentence = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively.", Interval(9,10), goldEntity="MEASUREMENT", goldNorm="7.9 t/ha") + ensure(sentence = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively.", Interval(12,13), goldEntity="MEASUREMENT", goldNorm="7.1 t/ha") + } + // TODO: this requires non trivial changes to the tokenizer /* // tests for recognizing units which are sticked to values From 66ee8a45d6841282dc1f0333bb91803e14e53d6e Mon Sep 17 00:00:00 2001 From: Maria Alexeeva Date: Mon, 20 Jun 2022 16:15:55 -0700 Subject: [PATCH 02/11] more normalization + tests --- .../org/clulab/numeric/date-ranges.yml | 9 ++++++++ .../org/clulab/numeric/measurements.yml | 3 ++- .../numeric/actions/NumericActions.scala | 9 +++++--- .../org/clulab/numeric/mentions/package.scala | 16 ++++++++++++++ .../TestNumericEntityRecognition.scala | 21 +++++++++++++++++++ 5 files changed, 54 insertions(+), 4 deletions(-) diff --git a/main/src/main/resources/org/clulab/numeric/date-ranges.yml b/main/src/main/resources/org/clulab/numeric/date-ranges.yml index 2939726cd..eb6fd8351 100644 --- a/main/src/main/resources/org/clulab/numeric/date-ranges.yml +++ b/main/src/main/resources/org/clulab/numeric/date-ranges.yml @@ -184,3 +184,12 @@ action: mkDateRangeMentionVagueSeason pattern: | /^(1\d\d\d|2\d\d\d)(WS|DS)$/ | @year:PossibleYear (? [word = /^(WS|DS)$/]) + +- name: date-one-token-year-range + label: Date + priority: ${ rulepriority } + type: token + example: "The areas sown for this 2021/2022 wintering campaign are. [...] rice yield will increase from 3600 in 2000-2009 [...]" + action: mkDateRangeMentionOneTokenYearRange + pattern: | + /^([12]\d\d\d)[\-\/]([12]\d\d\d)$/ \ No newline at end of file diff --git a/main/src/main/resources/org/clulab/numeric/measurements.yml b/main/src/main/resources/org/clulab/numeric/measurements.yml index f6fc374f2..e9960587d 100644 --- a/main/src/main/resources/org/clulab/numeric/measurements.yml +++ b/main/src/main/resources/org/clulab/numeric/measurements.yml @@ -22,7 +22,7 @@ rules: type: token action: mkSharedMeasurementMention pattern: | - (@number:Number [word = /,|and/]*)+ @number:Number @unit:MeasurementUnit + (@number:Number []{0,2} [word = /,|and|to/]*)+ @number:Number @unit:MeasurementUnit - name: measurement-percentage label: Percentage @@ -31,3 +31,4 @@ rules: action: mkPercentage pattern: | @number:Number [word=/(?i)pct|percent|%/] + diff --git a/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala b/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala index fefa64f94..f20b473ca 100644 --- a/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala +++ b/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala @@ -57,9 +57,7 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions { /** Constructs a MeasurementMention from a token pattern */ def mkSharedMeasurementMention(mentions: Seq[Mention], state: State): Seq[Mention] = { - val mens = convertOneToMany(mentions, toSharedMeasurementMention, "toSharedMeasurementMention") - mens.foreach(m => println("MEN: " + m.text)) - mens + convertOneToMany(mentions, toSharedMeasurementMention, "toSharedMeasurementMention") } def mkPercentage(mentions: Seq[Mention], state: State): Seq[Mention] = { @@ -144,6 +142,11 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions { convert(mentions, toDateRangeMentionFromVagueSeason, "mkDateRangeMentionVagueSeason") } + /** Constructs a DateRangeMention from a token pattern */ + def mkDateRangeMentionOneTokenYearRange(mentions: Seq[Mention], state: State): Seq[Mention] = { + convert(mentions, toDateRangeMentionFromOneTokenYearRange, "mkDateRangeMentionOneTokenYearRange") + } + /** Constructs a DateMention from a token pattern */ def mkDateMention(mentions: Seq[Mention], state: State): Seq[Mention] = { convert(mentions, toDateMention, "toDateMention") diff --git a/main/src/main/scala/org/clulab/numeric/mentions/package.scala b/main/src/main/scala/org/clulab/numeric/mentions/package.scala index d8ec9742d..e81ccc6ef 100644 --- a/main/src/main/scala/org/clulab/numeric/mentions/package.scala +++ b/main/src/main/scala/org/clulab/numeric/mentions/package.scala @@ -506,6 +506,22 @@ package object mentions { throw new RuntimeException(s"ERROR: cannot convert mention of type ${m.getClass.toString} to DateRangeMention!") } + /** handles one token year ranges, e.g., 2020/2021 and 2020-2021 */ + def toDateRangeMentionFromOneTokenYearRange(mention: Mention): DateRangeMention = mention match { + case m: DateRangeMention => m + + case m: TextBoundMention => + val years = m.text.split("[-\\/]") + DateRangeMention( + m, + TempEvalFormatter.mkDate(None, None, Some(Seq(years.head))), + TempEvalFormatter.mkDate(None, None, Some(Seq(years.last))) + ) + + case m => + throw new RuntimeException(s"ERROR: cannot convert mention of type ${m.getClass.toString} to DateRangeMention!") + } + def toDateMention(mention: Mention): DateMention = mention match { case m: DateMention => m diff --git a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala index 18ed1d9ab..a3efb9c02 100644 --- a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala +++ b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala @@ -153,6 +153,11 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { ensure(sentence= "19:02.", Interval(0, 1), goldEntity= "DATE", goldNorm= "XX19-02-XX") } + it should "recognize one token year ranges" in { + ensure(sentence= "2021/2022", Interval(0, 1), goldEntity= "DATE-RANGE", goldNorm= "2021-XX-XX -- 2022-XX-XX") + ensure(sentence= "2000-2009", Interval(0, 1), goldEntity= "DATE-RANGE", goldNorm= "2000-XX-XX -- 2009-XX-XX") + } + it should "recognize numeric dates of form month of year" in { ensure(sentence= "sowing date is best in May of 2020", Interval(5, 8), goldEntity= "DATE", goldNorm= "2020-05-XX") ensure(sentence= "sowing date in July of 2020", Interval(3, 6), goldEntity= "DATE", goldNorm= "2020-07-XX") @@ -322,6 +327,12 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { it should "recognize date ranges with vague seasons" in { ensure("Seeding dates ranged from 22 August to 26 September in 2011WS.", Interval(3, 11), "DATE-RANGE", "2011-08-22 -- 2011-09-26") + ensure("The planned timing for the first split was 23 days after sowing (from 3 to 13 August in the 1999WS and from 14 to 25 August in the 2000WS", + Interval(13, 21), "DATE-RANGE", "1999-08-03 -- 1999-08-13", + ) + ensure("The planned timing for the first split was 23 days after sowing (from 3 to 13 August in the 1999WS and from 14 to 25 August in the 2000WS", + Interval(22, 29), "DATE-RANGE", "2000-08-14 -- 2000-08-25", + ) } it should "recognize date ranges (month/day) with vague seasons" in { @@ -438,12 +449,22 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { // TODO: not sure what should be the output of such measurement '3 or 4 days' ensure(sentence= "and lasted 3 or 4 days in both wet seasons", Interval(4, 6), goldEntity="MEASUREMENT", goldNorm="4.0 d") + ensure(sentence= "ranged from 2.7 t ha-1 to 7.1 t ha-1", Interval(1, 9), goldEntity="MEASUREMENT", goldNorm="2.7 -- 7.1 t/ha") + ensure(sentence= "yields were between 8.8 t ha-1 and 9.2 t ha-1", Interval(2, 10), goldEntity="MEASUREMENT", goldNorm="8.8 -- 9.2 t/ha") } it should "recognize shared units" in { ensure(sentence = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively.", Interval(7,8), goldEntity="MEASUREMENT", goldNorm="6.4 t/ha") ensure(sentence = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively.", Interval(9,10), goldEntity="MEASUREMENT", goldNorm="7.9 t/ha") ensure(sentence = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively.", Interval(12,13), goldEntity="MEASUREMENT", goldNorm="7.1 t/ha") + ensure(sentence = "was estimated at 9 and 10 t / ha", Interval(3, 4), goldEntity="MEASUREMENT", goldNorm="9.0 t/ha") + ensure(sentence = "was estimated at 9 and 10 t / ha", Interval(5, 9), goldEntity="MEASUREMENT", goldNorm="10.0 t/ha") + ensure(sentence = "yield will increase from 3600 in 2000-2009 to 4500 kg ha-1 in 2090-2099", Interval(4, 5), goldEntity="MEASUREMENT", goldNorm="3600.0 kg/ha") + } + + it should "not recognize preposition `in` as `inch`" in { + ensure(sentence = "released as Sahel 108 in Senegal in 1994", Interval(3,5), goldEntity="O", goldNorm="") + ensure(sentence = "92% grew Sahel 108 in 2012DS", Interval(3,5), goldEntity="O", goldNorm="") } // TODO: this requires non trivial changes to the tokenizer From 442abcb93e0ebbbfae6071bef9d43b81f12c87c5 Mon Sep 17 00:00:00 2001 From: Maria Alexeeva Date: Mon, 20 Jun 2022 19:09:31 -0700 Subject: [PATCH 03/11] correcting mention tok interval + cleanup --- .../resources/org/clulab/numeric/measurements.yml | 5 ++--- .../clulab/numeric/actions/NumericActions.scala | 4 ++-- .../org/clulab/numeric/mentions/package.scala | 12 ++++-------- .../scala/org/clulab/odin/ExtractorEngine.scala | 1 - .../numeric/TestNumericEntityRecognition.scala | 15 ++++++++++----- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/main/src/main/resources/org/clulab/numeric/measurements.yml b/main/src/main/resources/org/clulab/numeric/measurements.yml index e9960587d..aa1d085b8 100644 --- a/main/src/main/resources/org/clulab/numeric/measurements.yml +++ b/main/src/main/resources/org/clulab/numeric/measurements.yml @@ -6,7 +6,7 @@ rules: type: token action: mkMeasurementMention pattern: | - @number:Number @unit:MeasurementUnit + @number:Number @unit:MeasurementUnit - name: measurement-2 label: Measurement @@ -30,5 +30,4 @@ rules: type: token action: mkPercentage pattern: | - @number:Number [word=/(?i)pct|percent|%/] - + @number:Number [word=/(?i)pct|percent|%/] \ No newline at end of file diff --git a/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala b/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala index f20b473ca..52b896934 100644 --- a/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala +++ b/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala @@ -29,7 +29,7 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions { } /** Converts a sequence of mentions to new types given the converter function */ - private def convertOneToMany(mentions: Seq[Mention], converter: Mention => Seq[Mention], converterName: String): Seq[Mention] = { + private def convertWithOneToManyConverter(mentions: Seq[Mention], converter: Mention => Seq[Mention], converterName: String): Seq[Mention] = { val convertedMentions = new ArrayBuffer[Mention]() for(m <- mentions) { try { @@ -57,7 +57,7 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions { /** Constructs a MeasurementMention from a token pattern */ def mkSharedMeasurementMention(mentions: Seq[Mention], state: State): Seq[Mention] = { - convertOneToMany(mentions, toSharedMeasurementMention, "toSharedMeasurementMention") + convertWithOneToManyConverter(mentions, toSharedMeasurementMention, "toSharedMeasurementMention") } def mkPercentage(mentions: Seq[Mention], state: State): Seq[Mention] = { diff --git a/main/src/main/scala/org/clulab/numeric/mentions/package.scala b/main/src/main/scala/org/clulab/numeric/mentions/package.scala index e81ccc6ef..3a921165c 100644 --- a/main/src/main/scala/org/clulab/numeric/mentions/package.scala +++ b/main/src/main/scala/org/clulab/numeric/mentions/package.scala @@ -74,16 +74,12 @@ package object mentions { case m: MeasurementMention => Seq(m) case m: RelationMention => -// if (mention.arguments("number").length > 1) -// println("M: " + mention.text + " " + mention.label) mention.arguments("number").sortBy(_.tokenInterval).map { a => -// println("num: " + a.words.mkString("|")) -// println("unit: " + m.arguments("unit").head.text) -// println("unit: " + getArgWords("unit", m)) val newArgs = Seq(m.arguments("unit").head, a).sortBy(_.tokenInterval) - val newTokInt = Interval(newArgs.head.tokenInterval.start, newArgs.last.tokenInterval.end) -// println(">>" + newArgs.map(_.text).mkString("::")) -// println(">>>" + newTokInt + "\n") + // if num and unit are adjacent, include both in new token int, else use the token int of the number arg + val newTokInt = if (newArgs.last.start - newArgs.head.end == 1) { + Interval(newArgs.head.tokenInterval.start, newArgs.last.tokenInterval.end) + } else a.tokenInterval new MeasurementMention( m.labels, a.tokenInterval, diff --git a/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala b/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala index 26bbdd941..cb687b2cd 100644 --- a/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala +++ b/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala @@ -49,7 +49,6 @@ class ExtractorEngine(val extractors: Vector[Extractor], val globalAction: Actio mention <- globalAction(extractedMentions, state) if mention.isValid && !state.contains(mention) } yield mention -// for (m <- finalMentions) println("men: " + m.label + " " + m.text + " " + m.foundBy + " " + m.arguments.flatMap(_._2).toList.length) // return the final mentions finalMentions } diff --git a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala index a3efb9c02..f78fd7a33 100644 --- a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala +++ b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala @@ -592,13 +592,18 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { if(goldEntity.nonEmpty) { var first = true for (i <- span.indices) { - val prefix = if (first) "B-" else "I-" - val label = prefix + goldEntity + if (goldEntity == "O") { + norms(i) should be(goldNorm) + } else { + val prefix = if (first) "B-" else "I-" + val label = prefix + goldEntity - entities(i) should be(label) - norms(i) should be(goldNorm) + entities(i) should be(label) + norms(i) should be(goldNorm) + + first = false + } - first = false } } } From d1178f3f1101bfb89a4a92a86b29b3b9e40c3eb2 Mon Sep 17 00:00:00 2001 From: Maria Alexeeva Date: Tue, 21 Jun 2022 08:34:48 -0700 Subject: [PATCH 04/11] removed wrong commas --- .../org/clulab/numeric/TestNumericEntityRecognition.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala index f78fd7a33..ff7b032d7 100644 --- a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala +++ b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala @@ -328,10 +328,10 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { ensure("Seeding dates ranged from 22 August to 26 September in 2011WS.", Interval(3, 11), "DATE-RANGE", "2011-08-22 -- 2011-09-26") ensure("The planned timing for the first split was 23 days after sowing (from 3 to 13 August in the 1999WS and from 14 to 25 August in the 2000WS", - Interval(13, 21), "DATE-RANGE", "1999-08-03 -- 1999-08-13", + Interval(13, 21), "DATE-RANGE", "1999-08-03 -- 1999-08-13" ) ensure("The planned timing for the first split was 23 days after sowing (from 3 to 13 August in the 1999WS and from 14 to 25 August in the 2000WS", - Interval(22, 29), "DATE-RANGE", "2000-08-14 -- 2000-08-25", + Interval(22, 29), "DATE-RANGE", "2000-08-14 -- 2000-08-25" ) } @@ -589,7 +589,7 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { println("Entities: " + entities.mkString(", ")) println("Norms: " + norms.mkString(", ")) - if(goldEntity.nonEmpty) { + if (goldEntity.nonEmpty) { var first = true for (i <- span.indices) { if (goldEntity == "O") { From e7915a5fdcaa23f4c25624e186ece22c70d2e210 Mon Sep 17 00:00:00 2001 From: Maria Alexeeva Date: Wed, 22 Jun 2022 08:27:03 -0700 Subject: [PATCH 05/11] added another way to write mg/l --- main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv b/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv index edd067628..21012c9f9 100644 --- a/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv +++ b/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv @@ -174,6 +174,7 @@ kgP2O5/ha // kg/ha Mg ha-1 // Mg/ha mg / l // mg/l mg.l-1 // mg/l +mg l-1 // mg/l mg / kg // mg/kg mg.kg-1 // mg/kg mg kg-1 // mg/kg From 7f5d111d2b4da9b6bb353fb0201d4b61e1bf2500 Mon Sep 17 00:00:00 2001 From: Maria Alexeeva Date: Mon, 4 Jul 2022 14:37:55 -0700 Subject: [PATCH 06/11] added more numeric tests + small rule adjustments --- main/src/main/resources/org/clulab/numeric/dates.yml | 2 +- main/src/main/resources/org/clulab/numeric/measurements.yml | 2 +- .../scala/org/clulab/numeric/NumericEntityRecognizer.scala | 4 +--- .../org/clulab/numeric/TestNumericEntityRecognition.scala | 6 ++++++ 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/main/src/main/resources/org/clulab/numeric/dates.yml b/main/src/main/resources/org/clulab/numeric/dates.yml index 5693232dd..cefbf2734 100644 --- a/main/src/main/resources/org/clulab/numeric/dates.yml +++ b/main/src/main/resources/org/clulab/numeric/dates.yml @@ -47,7 +47,7 @@ rules: example: "It was May 12 of 2000" action: mkDateMention pattern: | - @month:PossibleMonth ( @day:PossibleDay )? "of"? @year:PossibleYear + @month:PossibleMonth ( @day:PossibleDay )? /^(of|in)$/? @year:PossibleYear # American date format, with mandatory year - name: date-4 diff --git a/main/src/main/resources/org/clulab/numeric/measurements.yml b/main/src/main/resources/org/clulab/numeric/measurements.yml index aa1d085b8..4f9316577 100644 --- a/main/src/main/resources/org/clulab/numeric/measurements.yml +++ b/main/src/main/resources/org/clulab/numeric/measurements.yml @@ -22,7 +22,7 @@ rules: type: token action: mkSharedMeasurementMention pattern: | - (@number:Number []{0,2} [word = /,|and|to/]*)+ @number:Number @unit:MeasurementUnit + (@number:Number [!tag = /^NN|LRB/]{0,2} [word = /,|and|to/]*)+ @number:Number @unit:MeasurementUnit - name: measurement-percentage label: Percentage diff --git a/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala b/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala index 96fede26a..ce912feaa 100644 --- a/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala +++ b/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala @@ -52,9 +52,7 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions } // global actions *after* all grammars are done - mentions = actions.cleanupAction(mentions) - - mentions + actions.cleanupAction(mentions) } } diff --git a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala index ff7b032d7..ea1fb39a7 100644 --- a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala +++ b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala @@ -175,6 +175,11 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { ensure(sentence= "February 21 of 1002", Interval(0, 4), goldEntity= "DATE", goldNorm= "1002-02-21") } + it should "recognize numeric dates of form month date in year" in { + ensure("The first sowing dates started on July 1st in 2010 and on July 8th in 2011", Interval(6, 10), "DATE", "2010-07-01") + ensure("The first sowing dates started on July 1st in 2010 and on July 8th in 2011", Interval(12, 16), "DATE", "2011-07-08") + } + it should "recognize dates with ordinal days" in { ensure(sentence = "Planting dates are between July 1st and August 2nd.", Interval(3, 9), goldEntity = "DATE-RANGE", "XXXX-07-01 -- XXXX-08-02") } @@ -459,6 +464,7 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers { ensure(sentence = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively.", Interval(12,13), goldEntity="MEASUREMENT", goldNorm="7.1 t/ha") ensure(sentence = "was estimated at 9 and 10 t / ha", Interval(3, 4), goldEntity="MEASUREMENT", goldNorm="9.0 t/ha") ensure(sentence = "was estimated at 9 and 10 t / ha", Interval(5, 9), goldEntity="MEASUREMENT", goldNorm="10.0 t/ha") + ensure(sentence = "+ 100 kg ha-1 urea at 20 das + 50 kg ha-1 urea at 50 das", Interval(6, 8), goldEntity="O", goldNorm="") ensure(sentence = "yield will increase from 3600 in 2000-2009 to 4500 kg ha-1 in 2090-2099", Interval(4, 5), goldEntity="MEASUREMENT", goldNorm="3600.0 kg/ha") } From a56e7a4fae740d3204d2b25b921ff85cdd5ae832 Mon Sep 17 00:00:00 2001 From: Maria Alexeeva Date: Mon, 11 Jul 2022 07:39:43 -0700 Subject: [PATCH 07/11] more specific years to avoid capturing 2700-2800 as date --- main/src/main/resources/org/clulab/numeric/date-ranges.yml | 2 +- main/src/main/resources/org/clulab/numeric/dates.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/main/src/main/resources/org/clulab/numeric/date-ranges.yml b/main/src/main/resources/org/clulab/numeric/date-ranges.yml index eb6fd8351..f8f67942d 100644 --- a/main/src/main/resources/org/clulab/numeric/date-ranges.yml +++ b/main/src/main/resources/org/clulab/numeric/date-ranges.yml @@ -192,4 +192,4 @@ example: "The areas sown for this 2021/2022 wintering campaign are. [...] rice yield will increase from 3600 in 2000-2009 [...]" action: mkDateRangeMentionOneTokenYearRange pattern: | - /^([12]\d\d\d)[\-\/]([12]\d\d\d)$/ \ No newline at end of file + /^(1\d\d\d|20\d\d)[\-\/](1\d\d\d|20\d\d)$/ \ No newline at end of file diff --git a/main/src/main/resources/org/clulab/numeric/dates.yml b/main/src/main/resources/org/clulab/numeric/dates.yml index cefbf2734..081ccc220 100644 --- a/main/src/main/resources/org/clulab/numeric/dates.yml +++ b/main/src/main/resources/org/clulab/numeric/dates.yml @@ -76,7 +76,7 @@ rules: example: "26 September in 2011WS" action: mkDateMention pattern: | - ( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ /^the$/? (? [word = /^(\d\d\d\d)(WS|DS)$/]) + ( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ /^the$/? (? [word = /^(1\d\d\d|20\d\d)(WS|DS)$/]) # Rule for YYYY-MM-DD (accepts -, :, / as separators) - name: date-yyyy-mm-dd From 39a53f0c00230340619725c72ae866de6ec5512f Mon Sep 17 00:00:00 2001 From: Maria Alexeeva Date: Tue, 12 Jul 2022 09:38:32 -0700 Subject: [PATCH 08/11] temporary printouts --- .../org/clulab/numeric/mentions/MeasurementMention.scala | 5 ++++- main/src/main/scala/org/clulab/odin/ExtractorEngine.scala | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala b/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala index f93d75b63..fdf30c985 100644 --- a/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala +++ b/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala @@ -17,7 +17,10 @@ class MeasurementMention ( labels: Seq[String], val fromRange: Boolean) extends TextBoundMention(labels, tokenInterval, sentence, document, keep, foundBy, attachments) with Norm { + println("v: " + value.get.head) + println("u: " + unit.get.head) override def neNorm: String = { + println("here") assert(value.nonEmpty) assert(unit.nonEmpty) @@ -30,7 +33,7 @@ class MeasurementMention ( labels: Seq[String], if(numValueOpt.isEmpty) throw new RuntimeException(s"ERROR: could not parse the number [${value.mkString(" ")}] in the measurement ${raw.mkString(" ")}!") val unitNorm = UnitNormalizer.norm(unit.get) - + println("Unit norm: " + unitNorm) numValueOpt.get + " " + unitNorm } diff --git a/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala b/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala index cb687b2cd..5a84567de 100644 --- a/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala +++ b/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala @@ -50,6 +50,7 @@ class ExtractorEngine(val extractors: Vector[Extractor], val globalAction: Actio if mention.isValid && !state.contains(mention) } yield mention // return the final mentions + for (m <- finalMentions) println("fm: " + m.label + " " + m.text + " " + m.foundBy) finalMentions } From 2b935c669f70f8b953949da68f3fbe4ae99a786d Mon Sep 17 00:00:00 2001 From: Maria Alexeeva Date: Tue, 12 Jul 2022 15:59:34 -0700 Subject: [PATCH 09/11] a convoluted method to fix the Sahel 108 in(ch) Senegal error --- .../resources/org/clulab/numeric/atomic.yml | 2 +- .../numeric/NumericEntityRecognizer.scala | 1 - .../scala/org/clulab/numeric/package.scala | 24 +++++++++++++++++++ .../clulab/processors/clu/CluProcessor.scala | 1 + 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/main/src/main/resources/org/clulab/numeric/atomic.yml b/main/src/main/resources/org/clulab/numeric/atomic.yml index 3f80c95c1..b508665c9 100644 --- a/main/src/main/resources/org/clulab/numeric/atomic.yml +++ b/main/src/main/resources/org/clulab/numeric/atomic.yml @@ -14,7 +14,7 @@ rules: priority: ${ rulepriority } type: token pattern: | - [entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]* (?![tag = /NNP|CD/]) + [entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]* (?![tag = /CD/]) # possible years, from 1ddd to 20dd - name: year diff --git a/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala b/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala index ce912feaa..fd9fbd6fc 100644 --- a/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala +++ b/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala @@ -42,7 +42,6 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions def extractFrom(doc:Document): Seq[Mention] = { // dictionaries val originalEntities = matchLexiconNer(doc) - // grammars var mentions = extractor.extractFrom(doc) diff --git a/main/src/main/scala/org/clulab/numeric/package.scala b/main/src/main/scala/org/clulab/numeric/package.scala index 016e27939..a5845342d 100644 --- a/main/src/main/scala/org/clulab/numeric/package.scala +++ b/main/src/main/scala/org/clulab/numeric/package.scala @@ -5,6 +5,7 @@ import org.clulab.numeric.mentions.{DateMention, DateRangeMention, MeasurementMe import org.clulab.odin.{EventMention, Mention} import org.clulab.processors.{Document, Sentence} import org.clulab.struct.Interval +import scala.util.control.Breaks._ package object numeric { def displayMentions(mentions: Seq[Mention], doc: Document): Unit = { @@ -92,6 +93,29 @@ package object numeric { addLabelsAndNorms(mention.asInstanceOf[Norm], mention.sentenceObj, mention.tokenInterval) } } + removeOneEntityBeforeAnother(doc, "B-LOC", "MEASUREMENT") + } + + def removeOneEntityBeforeAnother(doc: Document, triggerEntity: String, toBeRemovedShortened: String): Unit = { + // removes entities and norms for unallowable entity sequences, e.g., don't extract 'in' as 'inch' before B-LOC in '... Sahal 108 in Senegal' + // toBeRemovedShortened is entity without BIO- + for(s <- doc.sentences) { + val zippedEntities = s.entities.get.zipWithIndex + for ((e, i) <- zippedEntities) { + if (i > 0 && e == triggerEntity && s.entities.get(i-1).endsWith(toBeRemovedShortened)) { + s.entities.get(i - 1) = "O" + // go in reverse replacing indices and norms in the immediate preceding mention + breakable { + for ((en, j) <- zippedEntities.slice(0, i ).reverse) { + if (en.endsWith(toBeRemovedShortened)) { + s.entities.get(j) = "O" + s.norms.get(j) = "" + } else break + } + } + } + } + } } private def addLabelsAndNorms(m: Norm, s: Sentence, tokenInt: Interval): Unit = { diff --git a/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala b/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala index 9f1dc30d7..97012cea9 100644 --- a/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala +++ b/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala @@ -617,6 +617,7 @@ class CluProcessor protected ( // numeric entities using our Odin rules // val numericMentions = numericEntityRecognizer.extractFrom(doc) + setLabelsAndNorms(doc, numericMentions) } From 097c89854e233e1a7c9b3fd1730532cb6db29b9b Mon Sep 17 00:00:00 2001 From: Maria Alexeeva Date: Wed, 13 Jul 2022 07:22:10 -0700 Subject: [PATCH 10/11] cleanup --- .../org/clulab/numeric/mentions/MeasurementMention.scala | 4 ---- main/src/main/scala/org/clulab/odin/ExtractorEngine.scala | 1 - .../main/scala/org/clulab/processors/clu/CluProcessor.scala | 1 - 3 files changed, 6 deletions(-) diff --git a/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala b/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala index fdf30c985..fab60eceb 100644 --- a/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala +++ b/main/src/main/scala/org/clulab/numeric/mentions/MeasurementMention.scala @@ -17,10 +17,7 @@ class MeasurementMention ( labels: Seq[String], val fromRange: Boolean) extends TextBoundMention(labels, tokenInterval, sentence, document, keep, foundBy, attachments) with Norm { - println("v: " + value.get.head) - println("u: " + unit.get.head) override def neNorm: String = { - println("here") assert(value.nonEmpty) assert(unit.nonEmpty) @@ -33,7 +30,6 @@ class MeasurementMention ( labels: Seq[String], if(numValueOpt.isEmpty) throw new RuntimeException(s"ERROR: could not parse the number [${value.mkString(" ")}] in the measurement ${raw.mkString(" ")}!") val unitNorm = UnitNormalizer.norm(unit.get) - println("Unit norm: " + unitNorm) numValueOpt.get + " " + unitNorm } diff --git a/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala b/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala index 5a84567de..cb687b2cd 100644 --- a/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala +++ b/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala @@ -50,7 +50,6 @@ class ExtractorEngine(val extractors: Vector[Extractor], val globalAction: Actio if mention.isValid && !state.contains(mention) } yield mention // return the final mentions - for (m <- finalMentions) println("fm: " + m.label + " " + m.text + " " + m.foundBy) finalMentions } diff --git a/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala b/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala index 97012cea9..3898878f9 100644 --- a/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala +++ b/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala @@ -723,7 +723,6 @@ class CluProcessor protected ( for(sent <- doc.sentences) { val headsWithLabels = parseSentenceWithEisner(sent.words, sent.tags.get, sent.entities.get, embeddings) parserPostProcessing(sent, headsWithLabels) - //println("headsWithLabels: " + headsWithLabels.mkString(" ")) val edges = new ListBuffer[Edge[String]]() val roots = new mutable.HashSet[Int]() From 526bf2a22f5eccf7e193f2fbb3e40030345ef598 Mon Sep 17 00:00:00 2001 From: Maria Alexeeva Date: Wed, 13 Jul 2022 21:30:22 -0700 Subject: [PATCH 11/11] updating version to that in main --- version.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.sbt b/version.sbt index c4fe1ff11..710945120 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -version in ThisBuild := "8.5.1-SNAPSHOT" +version in ThisBuild := "8.5.2-SNAPSHOT"