Skip to content

Commit

Permalink
Merge pull request #649 from clulab/masha-norms
Browse files Browse the repository at this point in the history
more norms
  • Loading branch information
kwalcock authored Jul 18, 2022
2 parents 8850190 + 9b7335c commit 23799f6
Show file tree
Hide file tree
Showing 13 changed files with 171 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ kgP2O5/ha // kg/ha
Mg ha-1 // Mg/ha
mg / l // mg/l
mg.l-1 // mg/l
mg l-1 // mg/l
mg / kg // mg/kg
mg.kg-1 // mg/kg
mg kg-1 // mg/kg
Expand Down
2 changes: 1 addition & 1 deletion main/src/main/resources/org/clulab/numeric/atomic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ rules:
priority: ${ rulepriority }
type: token
pattern: |
[entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]*
[entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]* (?![tag = /CD/])
# possible years, from 1ddd to 20dd
- name: year
Expand Down
9 changes: 9 additions & 0 deletions main/src/main/resources/org/clulab/numeric/date-ranges.yml
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,12 @@
action: mkDateRangeMentionVagueSeason
pattern: |
/^(1\d\d\d|2\d\d\d)(WS|DS)$/ | @year:PossibleYear (?<season> [word = /^(WS|DS)$/])
# year ranges written as a single token with '-' or '/' between the years, e.g., 2021/2022 or 2000-2009
# (each year must look like 1ddd or 20dd)
- name: date-one-token-year-range
label: Date
priority: ${ rulepriority }
type: token
example: "The areas sown for this 2021/2022 wintering campaign are. [...] rice yield will increase from 3600 in 2000-2009 [...]"
action: mkDateRangeMentionOneTokenYearRange
pattern: |
/^(1\d\d\d|20\d\d)[\-\/](1\d\d\d|20\d\d)$/
4 changes: 2 additions & 2 deletions main/src/main/resources/org/clulab/numeric/dates.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ rules:
example: "It was May 12 of 2000"
action: mkDateMention
pattern: |
@month:PossibleMonth ( @day:PossibleDay )? "of"? @year:PossibleYear
@month:PossibleMonth ( @day:PossibleDay )? /^(of|in)$/? @year:PossibleYear
# American date format, with mandatory year
- name: date-4
Expand Down Expand Up @@ -76,7 +76,7 @@ rules:
example: "26 September in 2011WS"
action: mkDateMention
pattern: |
( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ (?<year> [word = /^(\d\d\d\d)(WS|DS)$/])
( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ /^the$/? (?<year> [word = /^(1\d\d\d|20\d\d)(WS|DS)$/])
# Rule for YYYY-MM-DD (accepts -, :, / as separators)
- name: date-yyyy-mm-dd
Expand Down
11 changes: 10 additions & 1 deletion main/src/main/resources/org/clulab/numeric/measurements.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
rules:

- name: measurement-1
label: Measurement
priority: ${ rulepriority }
Expand All @@ -15,10 +16,18 @@ rules:
pattern: |
@number:NumberRange @unit:MeasurementUnit
# several numbers that share one trailing unit, e.g., "6.4, 7.9, and 7.1 t/ha";
# every @number argument is paired with the single @unit by mkSharedMeasurementMention
# NOTE(review): /,|and|to/ is not anchored with ^...$ like the year/season patterns - confirm it cannot match inside longer words
- name: measurement-3
label: Measurement
priority: ${ rulepriority }
type: token
action: mkSharedMeasurementMention
pattern: |
(@number:Number [!tag = /^NN|LRB/]{0,2} [word = /,|and|to/]*)+ @number:Number @unit:MeasurementUnit
- name: measurement-percentage
label: Percentage
priority: ${ rulepriority }
type: token
action: mkPercentage
pattern: |
@number:Number [word=/(?i)pct|percent|%/]
@number:Number [word=/(?i)pct|percent|%/]
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
example: "Weeding timing ranged from 2 to 17 days"
action: mkNumberRangeMention
pattern: |
/(?i)(from|between)/ @number1:Number /(?i)(to|and|\-)/ @number2:Number
/(?i)(from|between)/ @number1:Number [entity = /MEASUREMENT-UNIT/]* /(?i)(to|and|\-)/ @number2:Number
- name: number-range-2
priority: ${rulepriority}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions
def extractFrom(doc:Document): Seq[Mention] = {
// dictionaries
val originalEntities = matchLexiconNer(doc)

// grammars
var mentions = extractor.extractFrom(doc)

Expand All @@ -52,9 +51,7 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions
}

// global actions *after* all grammars are done
mentions = actions.cleanupAction(mentions)

mentions
actions.cleanupAction(mentions)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,24 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions {
val convertedMentions = new ArrayBuffer[Mention]()
for(m <- mentions) {
try {
convertedMentions += converter(m)
convertedMentions += converter(m )
} catch {
case e: Exception =>
// sometimes these conversions fail, mainly on broken texts
// let's be robust here: report the error and move on
System.err.println(s"WARNING: $converterName conversion failed! Recovering and continuing...")
e.printStackTrace()
}
}
convertedMentions
}

/** Converts a sequence of mentions to new types given the converter function */
private def convertWithOneToManyConverter(mentions: Seq[Mention], converter: Mention => Seq[Mention], converterName: String): Seq[Mention] = {
val convertedMentions = new ArrayBuffer[Mention]()
for(m <- mentions) {
try {
convertedMentions ++= converter(m )
} catch {
case e: Exception =>
// sometimes these conversions fail, mainly on broken texts
Expand All @@ -38,6 +55,11 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions {
convert(mentions, toMeasurementMention, "toMeasurementMention")
}

/**
  * Odin action for matches in which several numbers share a single unit.
  * Each incoming mention is handed to `toSharedMeasurementMention`, which may return
  * more than one mention per input; `state` is not used.
  */
def mkSharedMeasurementMention(mentions: Seq[Mention], state: State): Seq[Mention] = {
  val converterName = "toSharedMeasurementMention"
  convertWithOneToManyConverter(mentions, toSharedMeasurementMention, converterName)
}

/** Odin action that turns each matched mention into a percentage mention via `toPercentageMention`; `state` is not used. */
def mkPercentage(mentions: Seq[Mention], state: State): Seq[Mention] = {
  val converterName = "toPercentageMention"
  convert(mentions, toPercentageMention, converterName)
}
Expand Down Expand Up @@ -120,6 +142,11 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions {
convert(mentions, toDateRangeMentionFromVagueSeason, "mkDateRangeMentionVagueSeason")
}

/**
  * Odin action for year ranges that arrive as one token (e.g., 2021/2022);
  * each mention is converted with `toDateRangeMentionFromOneTokenYearRange`. `state` is not used.
  */
def mkDateRangeMentionOneTokenYearRange(mentions: Seq[Mention], state: State): Seq[Mention] = {
  val converterName = "mkDateRangeMentionOneTokenYearRange"
  convert(mentions, toDateRangeMentionFromOneTokenYearRange, converterName)
}

/** Constructs a DateMention from a token pattern */
def mkDateMention(mentions: Seq[Mention], state: State): Seq[Mention] = {
convert(mentions, toDateMention, "toDateMention")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ class MeasurementMention ( labels: Seq[String],
if(numValueOpt.isEmpty)
throw new RuntimeException(s"ERROR: could not parse the number [${value.mkString(" ")}] in the measurement ${raw.mkString(" ")}!")
val unitNorm = UnitNormalizer.norm(unit.get)

numValueOpt.get + " " + unitNorm
}

Expand Down
50 changes: 49 additions & 1 deletion main/src/main/scala/org/clulab/numeric/mentions/package.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.clulab.numeric

import org.clulab.odin.{Mention, RelationMention, TextBoundMention}
import org.clulab.odin.{EventMention, Mention, RelationMention, TextBoundMention}
import org.clulab.struct.Interval

import java.util.regex.Pattern

package object mentions {
Expand Down Expand Up @@ -68,6 +70,36 @@ package object mentions {
throw new RuntimeException(s"ERROR: cannot convert mention of type [${m.getClass.toString}] to MeasurementMention!")
}

/**
  * Splits a "shared unit" match (several numbers followed by one unit, e.g.,
  * "6.4, 7.9, and 7.1 t/ha") into one MeasurementMention per "number" argument.
  *
  * Every resulting mention gets the words of its own number and the words of the single
  * "unit" argument. Its token interval covers number and unit when the two are directly
  * adjacent; otherwise it covers only the number.
  *
  * @param mention a MeasurementMention (returned unchanged) or a RelationMention
  *                with "number" and "unit" arguments
  * @return one MeasurementMention per number, ordered by token interval
  * @throws RuntimeException if the mention is of any other type
  */
def toSharedMeasurementMention(mention: Mention): Seq[Mention] = mention match {
  case m: MeasurementMention => Seq(m)

  case m: RelationMention =>
    m.arguments("number").sortBy(_.tokenInterval).map { numberArg =>
      val ordered = Seq(m.arguments("unit").head, numberArg).sortBy(_.tokenInterval)
      val earlier = ordered.head
      val later = ordered.last
      // token intervals are end-exclusive, so two arguments touch when the later one
      // starts exactly where the earlier one ends
      val adjacent = later.start == earlier.end
      // if num and unit are adjacent, include both in the new token interval,
      // else use the token interval of the number argument only
      val newTokInt =
        if (adjacent) Interval(earlier.tokenInterval.start, later.tokenInterval.end)
        else numberArg.tokenInterval
      new MeasurementMention(
        m.labels,
        newTokInt,
        m.sentence,
        m.document,
        m.keep,
        m.foundBy,
        m.attachments,
        Some(numberArg.words),
        getArgWords("unit", m),
        false
      )
    }

  case m =>
    throw new RuntimeException(s"ERROR: cannot convert mention of type [${m.getClass.toString}] to MeasurementMention!")
}


def toPercentageMention(mention: Mention): PercentageMention = mention match {
case m: PercentageMention => m

Expand Down Expand Up @@ -470,6 +502,22 @@ package object mentions {
throw new RuntimeException(s"ERROR: cannot convert mention of type ${m.getClass.toString} to DateRangeMention!")
}

/**
  * Converts a one-token year range (e.g., 2020/2021 or 2020-2021) into a DateRangeMention.
  * The token text is split on '-' or '/'; the first piece becomes the start year and the
  * last piece the end year, with month and day left unspecified.
  */
def toDateRangeMentionFromOneTokenYearRange(mention: Mention): DateRangeMention = mention match {
  case alreadyConverted: DateRangeMention => alreadyConverted

  case textBound: TextBoundMention =>
    val pieces = textBound.text.split("[-\\/]")
    val startDate = TempEvalFormatter.mkDate(None, None, Some(Seq(pieces.head)))
    val endDate = TempEvalFormatter.mkDate(None, None, Some(Seq(pieces.last)))
    DateRangeMention(textBound, startDate, endDate)

  case other =>
    throw new RuntimeException(s"ERROR: cannot convert mention of type ${other.getClass.toString} to DateRangeMention!")
}

def toDateMention(mention: Mention): DateMention = mention match {
case m: DateMention => m

Expand Down
24 changes: 24 additions & 0 deletions main/src/main/scala/org/clulab/numeric/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import org.clulab.numeric.mentions.{DateMention, DateRangeMention, MeasurementMe
import org.clulab.odin.{EventMention, Mention}
import org.clulab.processors.{Document, Sentence}
import org.clulab.struct.Interval
import scala.util.control.Breaks._

package object numeric {
def displayMentions(mentions: Seq[Mention], doc: Document): Unit = {
Expand Down Expand Up @@ -92,6 +93,29 @@ package object numeric {
addLabelsAndNorms(mention.asInstanceOf[Norm], mention.sentenceObj, mention.tokenInterval)
}
}
removeOneEntityBeforeAnother(doc, "B-LOC", "MEASUREMENT")
}

/**
  * Removes entities and norms for unallowable entity sequences, e.g., so that 'in' is not
  * kept as 'inch' before B-LOC in '... Sahel 108 in Senegal'.
  *
  * For every token whose original label equals `triggerEntity` and whose predecessor's label
  * ends with `toBeRemovedShortened`, the contiguous run of labels ending with
  * `toBeRemovedShortened` directly to its left is reset to "O" and the matching norms to "".
  * The run is not split at B- tags, so directly adjacent mentions of that type are cleared together.
  *
  * Sentences without entities are skipped; if a sentence has no norms, only its entities are reset.
  *
  * @param doc document whose sentences are modified in place
  * @param triggerEntity full BIO label that triggers the removal, e.g., "B-LOC"
  * @param toBeRemovedShortened label to remove, given without the BIO prefix, e.g., "MEASUREMENT"
  */
def removeOneEntityBeforeAnother(doc: Document, triggerEntity: String, toBeRemovedShortened: String): Unit = {
  doc.sentences.foreach { sentence =>
    sentence.entities.foreach { entities =>
      // the extent of a run is decided on the labels as they were before any reset in this sentence
      val originalLabels = entities.toVector
      for (i <- 1 until originalLabels.length) {
        if (originalLabels(i) == triggerEntity && entities(i - 1).endsWith(toBeRemovedShortened)) {
          // walk leftwards over the preceding run, clearing labels and norms
          var j = i - 1
          while (j >= 0 && originalLabels(j).endsWith(toBeRemovedShortened)) {
            entities(j) = "O"
            sentence.norms.foreach { norms => norms(j) = "" }
            j -= 1
          }
        }
      }
    }
  }
}

private def addLabelsAndNorms(m: Norm, s: Sentence, tokenInt: Interval): Unit = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,7 @@ class CluProcessor protected (
// numeric entities using our Odin rules
//
val numericMentions = numericEntityRecognizer.extractFrom(doc)

setLabelsAndNorms(doc, numericMentions)
}

Expand Down Expand Up @@ -772,7 +773,6 @@ class CluProcessor protected (
for(sent <- doc.sentences) {
val headsWithLabels = parseSentenceWithEisner(sent.words, sent.tags.get, sent.entities.get, embeddings)
parserPostProcessing(sent, headsWithLabels)
//println("headsWithLabels: " + headsWithLabels.mkString(" "))

val edges = new ListBuffer[Edge[String]]()
val roots = new mutable.HashSet[Int]()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,11 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
ensure(sentence= "19:02.", Interval(0, 1), goldEntity= "DATE", goldNorm= "XX19-02-XX")
}

it should "recognize one token year ranges" in {
  // each input is expected to be a single token, hence the same one-token span for every case
  val cases = Seq(
    "2021/2022" -> "2021-XX-XX -- 2022-XX-XX",
    "2000-2009" -> "2000-XX-XX -- 2009-XX-XX"
  )
  for ((text, norm) <- cases)
    ensure(sentence= text, Interval(0, 1), goldEntity= "DATE-RANGE", goldNorm= norm)
}

it should "recognize numeric dates of form month of year" in {
ensure(sentence= "sowing date is best in May of 2020", Interval(5, 8), goldEntity= "DATE", goldNorm= "2020-05-XX")
ensure(sentence= "sowing date in July of 2020", Interval(3, 6), goldEntity= "DATE", goldNorm= "2020-07-XX")
Expand All @@ -170,6 +175,11 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
ensure(sentence= "February 21 of 1002", Interval(0, 4), goldEntity= "DATE", goldNorm= "1002-02-21")
}

it should "recognize numeric dates of form month date in year" in {
  // one sentence with two "month day in year" dates; each is checked separately
  val text = "The first sowing dates started on July 1st in 2010 and on July 8th in 2011"
  ensure(text, Interval(6, 10), "DATE", "2010-07-01")
  ensure(text, Interval(12, 16), "DATE", "2011-07-08")
}

it should "recognize dates with ordinal days" in {
  // ordinal day tokens (1st, 2nd) inside a "between ... and ..." range; the year is unknown
  val text = "Planting dates are between July 1st and August 2nd."
  ensure(sentence = text, Interval(3, 9), goldEntity = "DATE-RANGE", "XXXX-07-01 -- XXXX-08-02")
}
Expand Down Expand Up @@ -322,6 +332,12 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
it should "recognize date ranges with vague seasons" in {
  ensure("Seeding dates ranged from 22 August to 26 September in 2011WS.",
    Interval(3, 11), "DATE-RANGE", "2011-08-22 -- 2011-09-26")

  // "in the <year>WS" variant: two ranges in one sentence, each with its own season year
  val twoRanges = "The planned timing for the first split was 23 days after sowing (from 3 to 13 August in the 1999WS and from 14 to 25 August in the 2000WS"
  ensure(twoRanges, Interval(13, 21), "DATE-RANGE", "1999-08-03 -- 1999-08-13")
  ensure(twoRanges, Interval(22, 29), "DATE-RANGE", "2000-08-14 -- 2000-08-25")
}

it should "recognize date ranges (month/day) with vague seasons" in {
Expand Down Expand Up @@ -438,6 +454,23 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {

// TODO: not sure what should be the output of such measurement '3 or 4 days'
ensure(sentence= "and lasted 3 or 4 days in both wet seasons", Interval(4, 6), goldEntity="MEASUREMENT", goldNorm="4.0 d")
ensure(sentence= "ranged from 2.7 t ha-1 to 7.1 t ha-1", Interval(1, 9), goldEntity="MEASUREMENT", goldNorm="2.7 -- 7.1 t/ha")
ensure(sentence= "yields were between 8.8 t ha-1 and 9.2 t ha-1", Interval(2, 10), goldEntity="MEASUREMENT", goldNorm="8.8 -- 9.2 t/ha")
}

it should "recognize shared units" in {
  // three numbers sharing the trailing t/ha
  val threeValues = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively."
  ensure(sentence = threeValues, Interval(7,8), goldEntity="MEASUREMENT", goldNorm="6.4 t/ha")
  ensure(sentence = threeValues, Interval(9,10), goldEntity="MEASUREMENT", goldNorm="7.9 t/ha")
  ensure(sentence = threeValues, Interval(12,13), goldEntity="MEASUREMENT", goldNorm="7.1 t/ha")

  // two numbers sharing a multi-token unit
  val twoValues = "was estimated at 9 and 10 t / ha"
  ensure(sentence = twoValues, Interval(3, 4), goldEntity="MEASUREMENT", goldNorm="9.0 t/ha")
  ensure(sentence = twoValues, Interval(5, 9), goldEntity="MEASUREMENT", goldNorm="10.0 t/ha")

  // the checked span (presumably "20 das" - confirm tokenization) must not borrow the unit of the following measurement
  ensure(sentence = "+ 100 kg ha-1 urea at 20 das + 50 kg ha-1 urea at 50 das", Interval(6, 8), goldEntity="O", goldNorm="")

  // a number separated from its unit by a year range still receives that unit
  ensure(sentence = "yield will increase from 3600 in 2000-2009 to 4500 kg ha-1 in 2090-2099", Interval(4, 5), goldEntity="MEASUREMENT", goldNorm="3600.0 kg/ha")
}

it should "not recognize preposition `in` as `inch`" in {
  // in both sentences the span Interval(3,5) must carry no norm (gold entity "O")
  val sentences = Seq(
    "released as Sahel 108 in Senegal in 1994",
    "92% grew Sahel 108 in 2012DS"
  )
  for (text <- sentences)
    ensure(sentence = text, Interval(3,5), goldEntity="O", goldNorm="")
}

// TODO: this requires non trivial changes to the tokenizer
Expand Down Expand Up @@ -562,16 +595,21 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
println("Entities: " + entities.mkString(", "))
println("Norms: " + norms.mkString(", "))

if(goldEntity.nonEmpty) {
if (goldEntity.nonEmpty) {
var first = true
for (i <- span.indices) {
val prefix = if (first) "B-" else "I-"
val label = prefix + goldEntity
if (goldEntity == "O") {
norms(i) should be(goldNorm)
} else {
val prefix = if (first) "B-" else "I-"
val label = prefix + goldEntity

entities(i) should be(label)
norms(i) should be(goldNorm)

entities(i) should be(label)
norms(i) should be(goldNorm)
first = false
}

first = false
}
}
}
Expand Down

0 comments on commit 23799f6

Please sign in to comment.