Merge branch 'master' into masha-norms

clulab · Jul 15, 2022 · 9b7335c
2 parents 526bf2a + 8850190
commit 9b7335c
Show file tree

Hide file tree

Showing 29 changed files with 392 additions and 153 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,9 +12,9 @@ Makefile
 .bloop/
 .metals/
 .vscode/
-project/
 
 # sbt specific
+.bsp
 dist/*
 target/
 lib_managed/

diff --git a/CHANGES.md b/CHANGES.md
@@ -1,4 +1,7 @@
-+ **8.5.1** - Avoid empty words 
++ **8.5.2** - Processors no longer uses Scala logging, nor does it publish its runtime use of logback, so logback isn't forced onto clients. However, logback is still included in ivy.xml for publishLocal.
++ **8.5.1** - Normalize years with relative seasons and one-token numerical ranges
++ **8.5.1** - Avoid empty words from tokenizer, shorten evaluation filenames, add progress bars 
++ **8.5.1** - Recognize date from Month/Month to Month/Month 
 + **8.5.0** - Small parser improvements; added model for case restoration
 + **8.4.9** - Parser improvements: slimmer models; parsing using the Eisner algorithm
 + **8.4.9** - Removed support for embedding spans in Metal

diff --git a/build.sbt b/build.sbt
@@ -1,6 +1,6 @@
 val scala11 = "2.11.12" // up to 2.11.12
-val scala12 = "2.12.13" // up to 2.12.13
-val scala13 = "2.13.5"  // up to 2.13.5
+val scala12 = "2.12.15" // up to 2.12.15
+val scala13 = "2.13.8"  // up to 2.13.8
 // scala13 is waiting on ai.lum %% common.
 
 ThisBuild / crossScalaVersions := Seq(scala12, scala11)

diff --git a/corenlp/src/main/scala/org/clulab/processors/shallownlp/ShallowNLPProcessor.scala b/corenlp/src/main/scala/org/clulab/processors/shallownlp/ShallowNLPProcessor.scala
@@ -354,15 +354,15 @@ object ShallowNLPProcessor {
       // character offsets and actual text
       val sentStartOffset = sentence.startOffsets.head
       val sentEndOffset = sentence.endOffsets.last
-      crtSent.set(classOf[CharacterOffsetBeginAnnotation], new Integer(sentStartOffset))
-      crtSent.set(classOf[CharacterOffsetEndAnnotation], new Integer(sentEndOffset))
+      crtSent.set(classOf[CharacterOffsetBeginAnnotation], Integer.valueOf(sentStartOffset))
+      crtSent.set(classOf[CharacterOffsetEndAnnotation], Integer.valueOf(sentEndOffset))
       crtSent.set(classOf[TextAnnotation], doc.text.get.substring(sentStartOffset, sentEndOffset))
 
       // token and sentence offsets
-      crtSent.set(classOf[TokenBeginAnnotation], new Integer(tokenOffset))
+      crtSent.set(classOf[TokenBeginAnnotation], Integer.valueOf(tokenOffset))
       tokenOffset += crtTokens.size()
-      crtSent.set(classOf[TokenEndAnnotation], new Integer(tokenOffset))
-      crtSent.set(classOf[SentenceIndexAnnotation], new Integer(sentOffset)) // Stanford counts sentences starting from 0
+      crtSent.set(classOf[TokenEndAnnotation], Integer.valueOf(tokenOffset))
+      crtSent.set(classOf[SentenceIndexAnnotation], Integer.valueOf(sentOffset)) // Stanford counts sentences starting from 0
 
       sentencesAnnotation.add(crtSent)
       sentOffset += 1

diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock
@@ -214,7 +214,7 @@ GEM
       jekyll-seo-tag (~> 2.1)
     minitest (5.14.3)
     multipart-post (2.1.1)
-    nokogiri (1.13.4-x86_64-linux)
+    nokogiri (1.13.6-x86_64-linux)
       racc (~> 1.4)
     octokit (4.20.0)
       faraday (>= 0.9)

diff --git a/main/build.sbt b/main/build.sbt
@@ -6,7 +6,7 @@ pomIncludeRepository := { (repo: MavenRepository) =>
 }
 
 // for processors-models
-resolvers += "Artifactory" at "http://artifactory.cs.arizona.edu:8081/artifactory/sbt-release"
+resolvers += ("Artifactory" at "http://artifactory.cs.arizona.edu:8081/artifactory/sbt-release").withAllowInsecureProtocol(true)
 
 libraryDependencies ++= {
   val json4sVersion = "3.5.2"
@@ -29,11 +29,14 @@ libraryDependencies ++= {
     "org.clulab"                  % "lemport"                  % "0.9.10", // Portuguese lemmatizer
     "de.jollyday"                 % "jollyday"                 % "0.5.10", // for holidays normalization
     // logging
-    "ch.qos.logback"              % "logback-classic"          % "1.2.8",  // up to 1.2.8; less than 1.2 is vulnerable
-    "com.typesafe.scala-logging" %% "scala-logging"            % "3.7.2",
+    // The Scala interface is not used in processors.
+    // "com.typesafe.scala-logging" %% "scala-logging"            % "3.7.2",
+    // Instead, all code makes use of the Java interface.
     "org.slf4j"                   % "slf4j-api"                % "1.7.10",
+    // Local logging is provided here and not published.
+    "ch.qos.logback"              % "logback-classic"          % "1.2.8",  // up to 1.2.8; less than 1.2 is vulnerable
     // testing
-    "org.scalatest"              %% "scalatest"                % "3.0.1"  % Test,
+    "org.scalatest"              %% "scalatest"                % "3.0.1" % Test,
     // trained models for local ML models used in both main and corenlp
     // These are stored in the CLU lab Artifactory not maven!
     "org.clulab"                  % "glove-840b-300d-10f-kryo" % "1.0.0",

diff --git a/main/src/main/scala/org/clulab/dynet/Eisner.scala b/main/src/main/scala/org/clulab/dynet/Eisner.scala
@@ -237,24 +237,26 @@ class Eisner {
             dep.head - 1
           }
         val label = dep.label
-        heads(dep.mod - 1) = Tuple2(head, label)
+        heads(dep.mod - 1) = (head, label)
       }
     } else {
       // Eisner failed to produce a complete tree; revert to the greedy inference
       for(i <- scores.indices) {
         val relativeHead = scores(i).maxBy(_._2)._1.toInt
         val depMod = i + 1
         val depHead = if (relativeHead == 0) 0 else depMod + relativeHead
-        val label = dependencies(depMod)(depHead).label
+        // lift() checks the index, and Option(_) checks for nulls.
+        val valid = dependencies(depMod).lift(depHead).flatMap(Option(_)).isDefined
+        val label = if (valid) dependencies(depMod)(depHead).label else "root"
         val head =
           if(generateRelativeHeads) {
             // we are storing *relative* head positions here
-            relativeHead
+            if (valid) relativeHead else 0
           } else {
             // we are storing absolute heads, starting at offset 0
-            depHead - 1
+            if (valid) depHead - 1 else -1
           }
-        heads(i) = Tuple2(head, label)
+        heads(i) = (head, label)
       }
     }
     heads

diff --git a/main/src/main/scala/org/clulab/embeddings/WordEmbeddingMapPool.scala b/main/src/main/scala/org/clulab/embeddings/WordEmbeddingMapPool.scala
@@ -3,6 +3,7 @@ package org.clulab.embeddings
 import org.clulab.utils.Closer.AutoCloser
 import org.clulab.utils.InputStreamer
 import org.clulab.utils.InputStreamer.StreamResult
+import org.clulab.utils.ThreadUtils.NamedFuture
 
 import scala.collection.mutable
 import scala.concurrent.Await
@@ -11,7 +12,6 @@ import scala.concurrent.duration.Duration
 
 /** Manages a pool of word embedding maps, so we do not load them more than once */
 object WordEmbeddingMapPool {
-  import scala.concurrent.ExecutionContext.Implicits.global
 
   case class Key(name: String, compact: Boolean)
 
@@ -34,19 +34,22 @@ object WordEmbeddingMapPool {
 
   /** Fetches an embedding from the pool if it exists, or creates it otherwise */
   def getOrElseCreate(name: String, compact: Boolean = false, fileLocation: String = "", resourceLocation: String = ""): WordEmbeddingMap = {
+    // Using the global execution context may be a bad strategy.
+    import scala.concurrent.ExecutionContext.Implicits.global
+
     val wordEmbeddingMapFuture =
       if (enabled)
         this.synchronized {
           // Access the shared pool inside the synchronized section.
           pool.getOrElseUpdate(
             Key(name, compact),
-            Future {
+            NamedFuture("enabled WordEmbeddingMapPool.loadEmbedding") {
               loadEmbedding(name, fileLocation, resourceLocation, compact = compact)
             }
           )
         }
       else
-        Future {
+        NamedFuture("disabled WordEmbeddingMapPool.loadEmbedding") {
           loadEmbedding(name, fileLocation, resourceLocation, compact = compact)
         }
     // Wait for the result outside the synchronized section.

diff --git a/main/src/main/scala/org/clulab/learning/Dataset.scala b/main/src/main/scala/org/clulab/learning/Dataset.scala
@@ -7,9 +7,8 @@ import org.clulab.struct.Lexicon
 
 import scala.io.{BufferedSource, Source}
 import java.util.zip.GZIPInputStream
-import java.io.{BufferedInputStream, FileInputStream, FileWriter, PrintWriter}
-
-import org.slf4j.LoggerFactory
+import java.io.{FileWriter, PrintWriter}
+import org.slf4j.{Logger, LoggerFactory}
 import RVFDataset._
 import org.clulab.utils.Files
 
@@ -22,19 +21,19 @@ import org.clulab.utils.Files
 abstract class Dataset[L, F](
   val labelLexicon:Lexicon[L],
   val featureLexicon:Lexicon[F],
-  val labels:ArrayBuffer[Int]) extends Serializable {
+  val labels:ArrayBuffer[Int]) extends IndexedSeq[Datum[L, F]] with Serializable {
 
   def this() = this(new Lexicon[L], new Lexicon[F], new ArrayBuffer[Int])
 
   def += (datum:Datum[L, F]): Unit
 
-  def numFeatures = featureLexicon.size
-  def numLabels = labelLexicon.size
+  def numFeatures: Int = featureLexicon.size
+  def numLabels: Int = labelLexicon.size
 
   /** number of training examples */
-  def size = labels.size
+  override def size: Int = labels.size
 
-  def indices = 0 until size
+  override def indices: Range = labels.indices
 
   def featuresCounter(datumOffset:Int):Counter[Int]
 
@@ -52,6 +51,10 @@ abstract class Dataset[L, F](
 
   /** Convert this dataset to a CounterDataset */
   def toCounterDataset:CounterDataset[L, F]
+
+  override def length: Int = size
+
+  override def apply(idx: Int): Datum[L, F] = mkDatum(idx)
 }
 
 /**
@@ -124,7 +127,8 @@ class BVFDataset[L, F] (
 
     // sort all features in descending order of their IG
     val fb = new ListBuffer[(Int, Double)]
-    for(f <- igs.keySet) fb += new Tuple2(f, igs.get(f).get.ig(total))
+    // The (( here and elsewhere in this file is for Scala 2.11.
+    for(f <- igs.keySet) fb += ((f, igs(f).ig(total)))
     val sortedFeats = fb.sortBy(- _._2).toArray
 
     // keep the top pctToKeep
@@ -197,7 +201,7 @@ class BVFDataset[L, F] (
     for(i <- feats.indices) {
       val f = feats(i)
       if(featureIndexMap.contains(f)) {
-        newFeats += featureIndexMap.get(f).get
+        newFeats += featureIndexMap(f)
       }
     }
 
@@ -248,7 +252,7 @@ class RVFDataset[L, F] (
   private def featuresCounterToArray(fs:Counter[F]):Array[(Int, Double)] = {
     val fb = new ListBuffer[(Int, Double)]
     for(f <- fs.keySet) {
-      fb += new Tuple2[Int, Double](featureLexicon.add(f), fs.getCount(f))
+      fb += ((featureLexicon.add(f), fs.getCount(f)))
     }
     fb.sortBy(_._1).toArray
   }
@@ -327,7 +331,7 @@ class RVFDataset[L, F] (
       val f = feats(i)
       val v = vals(i)
       if(featureIndexMap.contains(f)) {
-        newFeats += featureIndexMap.get(f).get
+        newFeats += featureIndexMap(f)
         newVals += v
       }
     }
@@ -370,12 +374,12 @@ class InformationGain( var datumCount:Int = 0,
     pos + neg
   }
 
-  def pWith(total:InformationGain) = datumCount.toDouble / total.datumCount.toDouble
-  def pWithout(total:InformationGain) = (total.datumCount - datumCount).toDouble / total.datumCount.toDouble
+  def pWith(total:InformationGain): Double = datumCount.toDouble / total.datumCount.toDouble
+  def pWithout(total:InformationGain): Double = (total.datumCount - datumCount).toDouble / total.datumCount.toDouble
 }
 
 object RVFDataset {
-  val logger = LoggerFactory.getLogger(classOf[RVFDataset[String, String]])
+  val logger: Logger = LoggerFactory.getLogger(classOf[RVFDataset[String, String]])
 
   def mkDatasetFromSvmLightResource(path: String): RVFDataset[Int, String] = {
     val stream = getClass.getClassLoader.getResourceAsStream(path)
@@ -413,7 +417,7 @@ object RVFDataset {
       content = content.trim
       // logger.debug("Parsing line: [" + content + "]")
 
-      if(content.length > 0) {
+      if(content.nonEmpty) {
         val bits = content.split("\\s+")
 
         var label = bits(0)
@@ -451,7 +455,7 @@ object RVFDataset {
         val fi = featureLexicon.get(k)
         if(fi.isDefined) {
           // logger.debug(s"Feature [$k] converted to index ${fi.get + 1}")
-          fs += new Tuple2(fi.get + 1, c.getCount(k))
+          fs += ((fi.get + 1, c.getCount(k)))
         }
       }
       val fss = fs.toList.sortBy(_._1)
@@ -498,7 +502,7 @@ object RVFDataset {
       content = content.trim
       //logger.debug("Parsing line: " + content)
 
-      if(content.length > 0) {
+      if(content.nonEmpty) {
         val bits = content.split("\\s+")
 
         var label = bits(0)

diff --git a/main/src/main/scala/org/clulab/odin/Mention.scala b/main/src/main/scala/org/clulab/odin/Mention.scala
@@ -45,12 +45,24 @@ trait Mention extends Equals with Ordered[Mention] with Serializable {
     case m: EventMention => m.newWithAttachment(mod)
   }
 
+  def withAttachments(mods: Seq[Attachment]): Mention = this match {
+    case m: TextBoundMention => m.newWithAttachments(mods)
+    case m: RelationMention => m.newWithAttachments(mods)
+    case m: EventMention => m.newWithAttachments(mods)
+  }
+
   def withoutAttachment(mod: Attachment): Mention = this match {
     case m: TextBoundMention => m.newWithoutAttachment(mod)
     case m: RelationMention => m.newWithoutAttachment(mod)
     case m: EventMention => m.newWithoutAttachment(mod)
   }
 
+  def withoutAttachments(mods: Seq[Attachment]): Mention = this match {
+    case m: TextBoundMention => m.newWithoutAttachments(mods)
+    case m: RelationMention => m.newWithoutAttachments(mods)
+    case m: EventMention => m.newWithoutAttachments(mods)
+  }
+
   val paths: Map[String, Map[Mention, SynPath]]
 
   def getPath(argRole: String, mention: Mention): SynPath = paths(argRole)(mention)
@@ -236,10 +248,17 @@ class TextBoundMention(
     copy(attachments = this.attachments + mod)
   }
 
+  def newWithAttachments(mods: Seq[Attachment]): TextBoundMention = {
+    copy(attachments = this.attachments ++ mods)
+  }
+
   def newWithoutAttachment(mod: Attachment): TextBoundMention = {
     copy(attachments = this.attachments - mod)
   }
 
+  def newWithoutAttachments(mods: Seq[Attachment]): TextBoundMention = {
+    copy(attachments = this.attachments -- mods)
+  }
 }
 
 // NOTE that event mentions *may* have no arguments
@@ -324,10 +343,18 @@ class EventMention(
     copy(attachments = this.attachments + mod)
   }
 
+  def newWithAttachments(mods: Seq[Attachment]): EventMention = {
+    copy(attachments = this.attachments ++ mods)
+  }
+
   def newWithoutAttachment(mod: Attachment): EventMention = {
     copy(attachments = this.attachments - mod)
   }
 
+  def newWithoutAttachments(mods: Seq[Attachment]): EventMention = {
+    copy(attachments = this.attachments -- mods)
+  }
+
   // Convert an EventMention to a RelationMention by deleting the trigger
   def toRelationMention: RelationMention = {
     new RelationMention(
@@ -426,10 +453,18 @@ class RelationMention(
     copy(attachments = this.attachments + mod)
   }
 
+  def newWithAttachments(mods: Seq[Attachment]): RelationMention = {
+    copy(attachments = this.attachments ++ mods)
+  }
+
   def newWithoutAttachment(mod: Attachment): RelationMention = {
     copy(attachments = this.attachments - mod)
   }
 
+  def newWithoutAttachments(mods: Seq[Attachment]): RelationMention = {
+    copy(attachments = this.attachments -- mods)
+  }
+
   // Convert a RelationMention to an EventMention by specifying a trigger
   def toEventMention(trigger: TextBoundMention): EventMention = {
 

diff --git a/main/src/main/scala/org/clulab/odin/impl/RuleReader.scala b/main/src/main/scala/org/clulab/odin/impl/RuleReader.scala
@@ -16,7 +16,7 @@ import org.yaml.snakeyaml.constructor.{Constructor, ConstructorException}
 import org.clulab.odin._
 import org.clulab.odin.impl.MarkdownGeneration._
 import org.clulab.utils.FileUtils
-import org.clulab.utils.Closer._
+import org.clulab.utils.Closer.AutoCloser