Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

more norms #649

Merged
merged 12 commits into from
Jul 18, 2022
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ kgP2O5/ha // kg/ha
Mg ha-1 // Mg/ha
mg / l // mg/l
mg.l-1 // mg/l
mg l-1 // mg/l
mg / kg // mg/kg
mg.kg-1 // mg/kg
mg kg-1 // mg/kg
Expand Down
2 changes: 1 addition & 1 deletion main/src/main/resources/org/clulab/numeric/atomic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ rules:
priority: ${ rulepriority }
type: token
pattern: |
[entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]*
[entity=/B-MEASUREMENT-UNIT/ & !word = "DS"] [entity=/I-MEASUREMENT-UNIT/]* (?![tag = /CD/])

# possible years, from 1ddd to 20dd
- name: year
Expand Down
9 changes: 9 additions & 0 deletions main/src/main/resources/org/clulab/numeric/date-ranges.yml
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,12 @@
action: mkDateRangeMentionVagueSeason
pattern: |
/^(1\d\d\d|2\d\d\d)(WS|DS)$/ | @year:PossibleYear (?<season> [word = /^(WS|DS)$/])

# Date ranges written as a single token: two years joined by '-' or '/',
# e.g., 2021/2022 or 2000-2009. Each year must have the form 1ddd or 20dd.
- name: date-one-token-year-range
label: Date
priority: ${ rulepriority }
type: token
example: "The areas sown for this 2021/2022 wintering campaign are. [...] rice yield will increase from 3600 in 2000-2009 [...]"
action: mkDateRangeMentionOneTokenYearRange
pattern: |
/^(1\d\d\d|20\d\d)[\-\/](1\d\d\d|20\d\d)$/
4 changes: 2 additions & 2 deletions main/src/main/resources/org/clulab/numeric/dates.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ rules:
example: "It was May 12 of 2000"
action: mkDateMention
pattern: |
@month:PossibleMonth ( @day:PossibleDay )? "of"? @year:PossibleYear
@month:PossibleMonth ( @day:PossibleDay )? /^(of|in)$/? @year:PossibleYear

# American date format, with mandatory year
- name: date-4
Expand Down Expand Up @@ -76,7 +76,7 @@ rules:
example: "26 September in 2011WS"
action: mkDateMention
pattern: |
( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ (?<year> [word = /^(\d\d\d\d)(WS|DS)$/])
( @day:PossibleDay @month:PossibleMonth | @month:PossibleMonth @day:PossibleDay) /^in$/ /^the$/? (?<year> [word = /^(1\d\d\d|20\d\d)(WS|DS)$/])

# Rule for YYYY-MM-DD (accepts -, :, / as separators)
- name: date-yyyy-mm-dd
Expand Down
11 changes: 10 additions & 1 deletion main/src/main/resources/org/clulab/numeric/measurements.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
rules:

- name: measurement-1
label: Measurement
priority: ${ rulepriority }
Expand All @@ -15,10 +16,18 @@ rules:
pattern: |
@number:NumberRange @unit:MeasurementUnit

# Several numbers that share one trailing unit, e.g., "6.4, 7.9, and 7.1 t/ha".
# The pattern is: one or more Number mentions, each optionally followed by up to two tokens
# constrained by [!tag = /^NN|LRB/] and then any run of tokens whose word matches /,|and|to/,
# ending with a final Number that is directly followed by a MeasurementUnit.
# NOTE(review): the word regex /,|and|to/ is not anchored; if token regexes match substrings,
# words such as "tomato" or "sand" would also qualify -- confirm, and consider /^(,|and|to)$/.
- name: measurement-3
label: Measurement
priority: ${ rulepriority }
type: token
action: mkSharedMeasurementMention
pattern: |
(@number:Number [!tag = /^NN|LRB/]{0,2} [word = /,|and|to/]*)+ @number:Number @unit:MeasurementUnit

- name: measurement-percentage
label: Percentage
priority: ${ rulepriority }
type: token
action: mkPercentage
pattern: |
@number:Number [word=/(?i)pct|percent|%/]
@number:Number [word=/(?i)pct|percent|%/]
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
example: "Weeding timing ranged from 2 to 17 days"
action: mkNumberRangeMention
pattern: |
/(?i)(from|between)/ @number1:Number /(?i)(to|and|\-)/ @number2:Number
/(?i)(from|between)/ @number1:Number [entity = /MEASUREMENT-UNIT/]* /(?i)(to|and|\-)/ @number2:Number

- name: number-range-2
priority: ${rulepriority}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions
def extractFrom(doc:Document): Seq[Mention] = {
// dictionaries
val originalEntities = matchLexiconNer(doc)

// grammars
var mentions = extractor.extractFrom(doc)

Expand All @@ -52,9 +51,7 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions
}

// global actions *after* all grammars are done
mentions = actions.cleanupAction(mentions)

mentions
actions.cleanupAction(mentions)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,24 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions {
val convertedMentions = new ArrayBuffer[Mention]()
for(m <- mentions) {
try {
convertedMentions += converter(m)
convertedMentions += converter(m )
} catch {
case e: Exception =>
// sometimes these conversions fail, mainly on broken texts
// let's be robust here: report the error and move on
System.err.println(s"WARNING: $converterName conversion failed! Recovering and continuing...")
e.printStackTrace()
Comment on lines +24 to +25
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably all of processors needs to be reviewed for its use of println, System.out, and System.err. These should probably be logged rather than printed. That's not your problem.

}
}
convertedMentions
}
Comment on lines +19 to +29
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't necessary, but one way to do it without the buffer is

    for {
      m <- mentions
      converted <- Try(converter(m))
          .recover { case throwable: Throwable =>
            System.err.println(s"WARNING: $converterName conversion failed! Recovering and continuing...")
            throwable.printStackTrace()
            Seq.empty[Mention]
          }
          .get
    } yield converted

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense. With processors I basically go the path of fewest changes :)


/** Converts a sequence of mentions to new types given the converter function */
private def convertWithOneToManyConverter(mentions: Seq[Mention], converter: Mention => Seq[Mention], converterName: String): Seq[Mention] = {
val convertedMentions = new ArrayBuffer[Mention]()
for(m <- mentions) {
try {
convertedMentions ++= converter(m )
} catch {
case e: Exception =>
// sometimes these conversions fail, mainly on broken texts
Expand All @@ -38,6 +55,11 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions {
convert(mentions, toMeasurementMention, "toMeasurementMention")
}

/**
  * Odin action for rules in which several numbers share a single unit.
  * Each incoming mention may yield several mentions (see toSharedMeasurementMention);
  * conversion failures are handled by convertWithOneToManyConverter.
  */
def mkSharedMeasurementMention(mentions: Seq[Mention], state: State): Seq[Mention] =
  convertWithOneToManyConverter(
    mentions,
    toSharedMeasurementMention,
    "toSharedMeasurementMention"
  )

/** Odin action that converts each matched mention with toPercentageMention. */
def mkPercentage(mentions: Seq[Mention], state: State): Seq[Mention] =
  convert(
    mentions,
    toPercentageMention,
    "toPercentageMention"
  )
Expand Down Expand Up @@ -120,6 +142,11 @@ class NumericActions(seasonNormalizer: SeasonNormalizer) extends Actions {
convert(mentions, toDateRangeMentionFromVagueSeason, "mkDateRangeMentionVagueSeason")
}

/**
  * Odin action for single-token year ranges (e.g., 2021/2022):
  * converts each matched mention with toDateRangeMentionFromOneTokenYearRange.
  */
def mkDateRangeMentionOneTokenYearRange(mentions: Seq[Mention], state: State): Seq[Mention] =
  convert(
    mentions,
    toDateRangeMentionFromOneTokenYearRange,
    "mkDateRangeMentionOneTokenYearRange"
  )

/** Constructs a DateMention from a token pattern */
def mkDateMention(mentions: Seq[Mention], state: State): Seq[Mention] = {
convert(mentions, toDateMention, "toDateMention")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ class MeasurementMention ( labels: Seq[String],
if(numValueOpt.isEmpty)
throw new RuntimeException(s"ERROR: could not parse the number [${value.mkString(" ")}] in the measurement ${raw.mkString(" ")}!")
val unitNorm = UnitNormalizer.norm(unit.get)

numValueOpt.get + " " + unitNorm
}

Expand Down
50 changes: 49 additions & 1 deletion main/src/main/scala/org/clulab/numeric/mentions/package.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.clulab.numeric

import org.clulab.odin.{Mention, RelationMention, TextBoundMention}
import org.clulab.odin.{EventMention, Mention, RelationMention, TextBoundMention}
import org.clulab.struct.Interval

import java.util.regex.Pattern

package object mentions {
Expand Down Expand Up @@ -68,6 +70,36 @@ package object mentions {
throw new RuntimeException(s"ERROR: cannot convert mention of type [${m.getClass.toString}] to MeasurementMention!")
}

/**
  * Splits a shared-unit match (several "number" arguments and one "unit" argument)
  * into one MeasurementMention per number, each paired with the shared unit.
  *
  * @param mention a MeasurementMention (returned unchanged, wrapped in a Seq) or a
  *                RelationMention carrying "number" and "unit" arguments
  * @return one MeasurementMention per "number" argument, ordered by token interval
  * @throws RuntimeException if the mention is of any other type
  */
def toSharedMeasurementMention(mention: Mention): Seq[Mention] = mention match {
  case m: MeasurementMention => Seq(m)

  case m: RelationMention =>
    m.arguments("number").sortBy(_.tokenInterval).map { numberArg =>
      val Seq(first, second) = Seq(m.arguments("unit").head, numberArg).sortBy(_.tokenInterval)
      // Interval ends are exclusive, so number and unit are adjacent exactly when the first
      // one ends where the second one starts. In that case the new mention covers both;
      // otherwise it covers only the number.
      val newTokInt =
        if (first.tokenInterval.end == second.tokenInterval.start)
          Interval(first.tokenInterval.start, second.tokenInterval.end)
        else
          numberArg.tokenInterval
      new MeasurementMention(
        m.labels,
        newTokInt,
        m.sentence,
        m.document,
        m.keep,
        m.foundBy,
        m.attachments,
        Some(numberArg.words),
        getArgWords("unit", m),
        false
      )
    }

  case m =>
    throw new RuntimeException(s"ERROR: cannot convert mention of type [${m.getClass.toString}] to MeasurementMention!")
}


def toPercentageMention(mention: Mention): PercentageMention = mention match {
case m: PercentageMention => m

Expand Down Expand Up @@ -470,6 +502,22 @@ package object mentions {
throw new RuntimeException(s"ERROR: cannot convert mention of type ${m.getClass.toString} to DateRangeMention!")
}

/**
  * Builds a DateRangeMention from a single token that holds two years joined by
  * '-' or '/', e.g., 2020/2021 and 2020-2021.
  *
  * The first piece of the split text becomes the start year and the last piece the end year.
  * A mention that is already a DateRangeMention is returned unchanged; any type other than
  * DateRangeMention or TextBoundMention raises a RuntimeException.
  */
def toDateRangeMentionFromOneTokenYearRange(mention: Mention): DateRangeMention = mention match {
  case alreadyRange: DateRangeMention => alreadyRange

  case textBound: TextBoundMention =>
    val pieces = textBound.text.split("[-\\/]")
    val startDate = TempEvalFormatter.mkDate(None, None, Some(Seq(pieces.head)))
    val endDate = TempEvalFormatter.mkDate(None, None, Some(Seq(pieces.last)))
    DateRangeMention(textBound, startDate, endDate)

  case other =>
    throw new RuntimeException(s"ERROR: cannot convert mention of type ${other.getClass.toString} to DateRangeMention!")
}

def toDateMention(mention: Mention): DateMention = mention match {
case m: DateMention => m

Expand Down
24 changes: 24 additions & 0 deletions main/src/main/scala/org/clulab/numeric/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import org.clulab.numeric.mentions.{DateMention, DateRangeMention, MeasurementMe
import org.clulab.odin.{EventMention, Mention}
import org.clulab.processors.{Document, Sentence}
import org.clulab.struct.Interval
import scala.util.control.Breaks._

package object numeric {
def displayMentions(mentions: Seq[Mention], doc: Document): Unit = {
Expand Down Expand Up @@ -92,6 +93,29 @@ package object numeric {
addLabelsAndNorms(mention.asInstanceOf[Norm], mention.sentenceObj, mention.tokenInterval)
}
}
removeOneEntityBeforeAnother(doc, "B-LOC", "MEASUREMENT")
}

/**
  * Removes entity labels and norms for unallowable entity sequences, e.g., don't extract
  * 'in' as 'inch' before B-LOC in '... Sahel 108 in Senegal'.
  *
  * Whenever a token labeled `triggerEntity` directly follows a token whose label ends with
  * `toBeRemovedShortened`, the mention that ends on that preceding token is erased: walking
  * backwards, each label is reset to "O" and each norm to "" until the mention's "B-" token
  * has been cleared or a label no longer ends with `toBeRemovedShortened`.
  * Sentences without entities or norms are left untouched.
  *
  * @param doc                  document whose sentence entities and norms are edited in place
  * @param triggerEntity        full label (with BIO prefix) of the entity that triggers removal, e.g., "B-LOC"
  * @param toBeRemovedShortened label of the entity to remove, without the BIO prefix, e.g., "MEASUREMENT"
  */
def removeOneEntityBeforeAnother(doc: Document, triggerEntity: String, toBeRemovedShortened: String): Unit = {
  for (s <- doc.sentences; entities <- s.entities; norms <- s.norms) {
    for (i <- 1 until entities.length) {
      if (entities(i) == triggerEntity && entities(i - 1).endsWith(toBeRemovedShortened)) {
        // go in reverse replacing labels and norms in the immediately preceding mention only
        var j = i - 1
        var insideMention = true
        while (insideMention && j >= 0 && entities(j).endsWith(toBeRemovedShortened)) {
          // the "B-" token is the first token of the mention: clear it, then stop so that an
          // adjacent earlier mention with the same label is not erased as well
          val isMentionStart = entities(j).startsWith("B-")
          entities(j) = "O"
          norms(j) = ""
          insideMention = !isMentionStart
          j -= 1
        }
      }
    }
  }
}

private def addLabelsAndNorms(m: Norm, s: Sentence, tokenInt: Interval): Unit = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,7 @@ class CluProcessor protected (
// numeric entities using our Odin rules
//
val numericMentions = numericEntityRecognizer.extractFrom(doc)

setLabelsAndNorms(doc, numericMentions)
}

Expand Down Expand Up @@ -772,7 +773,6 @@ class CluProcessor protected (
for(sent <- doc.sentences) {
val headsWithLabels = parseSentenceWithEisner(sent.words, sent.tags.get, sent.entities.get, embeddings)
parserPostProcessing(sent, headsWithLabels)
//println("headsWithLabels: " + headsWithLabels.mkString(" "))

val edges = new ListBuffer[Edge[String]]()
val roots = new mutable.HashSet[Int]()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,11 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
ensure(sentence= "19:02.", Interval(0, 1), goldEntity= "DATE", goldNorm= "XX19-02-XX")
}

it should "recognize one token year ranges" in {
  // single-token year range -> expected normalized date range
  val expectedNorms = Seq(
    "2021/2022" -> "2021-XX-XX -- 2022-XX-XX",
    "2000-2009" -> "2000-XX-XX -- 2009-XX-XX"
  )
  for ((token, norm) <- expectedNorms) {
    ensure(sentence= token, Interval(0, 1), goldEntity= "DATE-RANGE", goldNorm= norm)
  }
}

it should "recognize numeric dates of form month of year" in {
ensure(sentence= "sowing date is best in May of 2020", Interval(5, 8), goldEntity= "DATE", goldNorm= "2020-05-XX")
ensure(sentence= "sowing date in July of 2020", Interval(3, 6), goldEntity= "DATE", goldNorm= "2020-07-XX")
Expand All @@ -170,6 +175,11 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
ensure(sentence= "February 21 of 1002", Interval(0, 4), goldEntity= "DATE", goldNorm= "1002-02-21")
}

it should "recognize numeric dates of form month date in year" in {
  // one sentence with two dates, each checked on its own token span
  val text = "The first sowing dates started on July 1st in 2010 and on July 8th in 2011"
  ensure(text, Interval(6, 10), "DATE", "2010-07-01")
  ensure(text, Interval(12, 16), "DATE", "2011-07-08")
}

it should "recognize dates with ordinal days" in {
ensure(sentence = "Planting dates are between July 1st and August 2nd.", Interval(3, 9), goldEntity = "DATE-RANGE", "XXXX-07-01 -- XXXX-08-02")
}
Expand Down Expand Up @@ -322,6 +332,12 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
it should "recognize date ranges with vague seasons" in {
  ensure("Seeding dates ranged from 22 August to 26 September in 2011WS.",
    Interval(3, 11), "DATE-RANGE", "2011-08-22 -- 2011-09-26")

  // one sentence with two ranges, each followed by "in the <year>WS"
  val twoRanges = "The planned timing for the first split was 23 days after sowing (from 3 to 13 August in the 1999WS and from 14 to 25 August in the 2000WS"
  ensure(twoRanges, Interval(13, 21), "DATE-RANGE", "1999-08-03 -- 1999-08-13")
  ensure(twoRanges, Interval(22, 29), "DATE-RANGE", "2000-08-14 -- 2000-08-25")
}

it should "recognize date ranges (month/day) with vague seasons" in {
Expand Down Expand Up @@ -438,6 +454,23 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {

// TODO: not sure what the output should be for a measurement such as '3 or 4 days'
ensure(sentence= "and lasted 3 or 4 days in both wet seasons", Interval(4, 6), goldEntity="MEASUREMENT", goldNorm="4.0 d")
ensure(sentence= "ranged from 2.7 t ha-1 to 7.1 t ha-1", Interval(1, 9), goldEntity="MEASUREMENT", goldNorm="2.7 -- 7.1 t/ha")
ensure(sentence= "yields were between 8.8 t ha-1 and 9.2 t ha-1", Interval(2, 10), goldEntity="MEASUREMENT", goldNorm="8.8 -- 9.2 t/ha")
}

it should "recognize shared units" in {
  // three numbers sharing one trailing unit
  val threeTargets = "Target yields on average were set to 6.4, 7.9, and 7.1 t/ha in 2011WS , 2012DS , and 2013DS , respectively."
  ensure(sentence = threeTargets, Interval(7,8), goldEntity="MEASUREMENT", goldNorm="6.4 t/ha")
  ensure(sentence = threeTargets, Interval(9,10), goldEntity="MEASUREMENT", goldNorm="7.9 t/ha")
  ensure(sentence = threeTargets, Interval(12,13), goldEntity="MEASUREMENT", goldNorm="7.1 t/ha")

  // two numbers sharing a multi-token unit
  val twoEstimates = "was estimated at 9 and 10 t / ha"
  ensure(sentence = twoEstimates, Interval(3, 4), goldEntity="MEASUREMENT", goldNorm="9.0 t/ha")
  ensure(sentence = twoEstimates, Interval(5, 9), goldEntity="MEASUREMENT", goldNorm="10.0 t/ha")

  ensure(sentence = "+ 100 kg ha-1 urea at 20 das + 50 kg ha-1 urea at 50 das", Interval(6, 8), goldEntity="O", goldNorm="")
  ensure(sentence = "yield will increase from 3600 in 2000-2009 to 4500 kg ha-1 in 2090-2099", Interval(4, 5), goldEntity="MEASUREMENT", goldNorm="3600.0 kg/ha")
}

it should "not recognize preposition `in` as `inch`" in {
  // in both sentences the span covering "108 in" must carry no norm
  val sentences = Seq(
    "released as Sahel 108 in Senegal in 1994",
    "92% grew Sahel 108 in 2012DS"
  )
  sentences.foreach { text =>
    ensure(sentence = text, Interval(3,5), goldEntity="O", goldNorm="")
  }
}

// TODO: this requires non trivial changes to the tokenizer
Expand Down Expand Up @@ -562,16 +595,21 @@ class TestNumericEntityRecognition extends FlatSpec with Matchers {
println("Entities: " + entities.mkString(", "))
println("Norms: " + norms.mkString(", "))

if(goldEntity.nonEmpty) {
if (goldEntity.nonEmpty) {
var first = true
for (i <- span.indices) {
val prefix = if (first) "B-" else "I-"
val label = prefix + goldEntity
if (goldEntity == "O") {
norms(i) should be(goldNorm)
} else {
val prefix = if (first) "B-" else "I-"
val label = prefix + goldEntity

entities(i) should be(label)
norms(i) should be(goldNorm)

entities(i) should be(label)
norms(i) should be(goldNorm)
first = false
}

first = false
}
}
}
Expand Down