Skip to content

Commit

Permalink
Merge branch 'master' into masha-norms
Browse files Browse the repository at this point in the history
  • Loading branch information
maxaalexeeva committed Jul 15, 2022
2 parents 526bf2a + 8850190 commit 9b7335c
Show file tree
Hide file tree
Showing 29 changed files with 392 additions and 153 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ Makefile
.bloop/
.metals/
.vscode/
project/

# sbt specific
.bsp
dist/*
target/
lib_managed/
Expand Down
5 changes: 4 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
+ **8.5.1** - Avoid empty words
+ **8.5.2** - Processors no longer uses Scala logging, nor does it publish its runtime use of logback, so logback isn't forced onto clients. Logback is still included in ivy.xml for publishLocal, however.
+ **8.5.1** - Normalize years with relative seasons and one-token numerical ranges
+ **8.5.1** - Avoid empty words from tokenizer, shorten evaluation filenames, add progress bars
+ **8.5.1** - Recognize date from Month/Month to Month/Month
+ **8.5.0** - Small parser improvements; added model for case restoration
+ **8.4.9** - Parser improvements: slimmer models; parsing using the Eisner algorithm
+ **8.4.9** - Removed support for embedding spans in Metal
Expand Down
4 changes: 2 additions & 2 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
val scala11 = "2.11.12" // up to 2.11.12
val scala12 = "2.12.13" // up to 2.12.13
val scala13 = "2.13.5" // up to 2.13.5
val scala12 = "2.12.15" // up to 2.12.15
val scala13 = "2.13.8" // up to 2.13.8
// scala13 is waiting on ai.lum %% common.

ThisBuild / crossScalaVersions := Seq(scala12, scala11)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -354,15 +354,15 @@ object ShallowNLPProcessor {
// character offsets and actual text
val sentStartOffset = sentence.startOffsets.head
val sentEndOffset = sentence.endOffsets.last
crtSent.set(classOf[CharacterOffsetBeginAnnotation], new Integer(sentStartOffset))
crtSent.set(classOf[CharacterOffsetEndAnnotation], new Integer(sentEndOffset))
crtSent.set(classOf[CharacterOffsetBeginAnnotation], Integer.valueOf(sentStartOffset))
crtSent.set(classOf[CharacterOffsetEndAnnotation], Integer.valueOf(sentEndOffset))
crtSent.set(classOf[TextAnnotation], doc.text.get.substring(sentStartOffset, sentEndOffset))

// token and sentence offsets
crtSent.set(classOf[TokenBeginAnnotation], new Integer(tokenOffset))
crtSent.set(classOf[TokenBeginAnnotation], Integer.valueOf(tokenOffset))
tokenOffset += crtTokens.size()
crtSent.set(classOf[TokenEndAnnotation], new Integer(tokenOffset))
crtSent.set(classOf[SentenceIndexAnnotation], new Integer(sentOffset)) // Stanford counts sentences starting from 0
crtSent.set(classOf[TokenEndAnnotation], Integer.valueOf(tokenOffset))
crtSent.set(classOf[SentenceIndexAnnotation], Integer.valueOf(sentOffset)) // Stanford counts sentences starting from 0

sentencesAnnotation.add(crtSent)
sentOffset += 1
Expand Down
2 changes: 1 addition & 1 deletion docs/Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ GEM
jekyll-seo-tag (~> 2.1)
minitest (5.14.3)
multipart-post (2.1.1)
nokogiri (1.13.4-x86_64-linux)
nokogiri (1.13.6-x86_64-linux)
racc (~> 1.4)
octokit (4.20.0)
faraday (>= 0.9)
Expand Down
11 changes: 7 additions & 4 deletions main/build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pomIncludeRepository := { (repo: MavenRepository) =>
}

// for processors-models
resolvers += "Artifactory" at "http://artifactory.cs.arizona.edu:8081/artifactory/sbt-release"
resolvers += ("Artifactory" at "http://artifactory.cs.arizona.edu:8081/artifactory/sbt-release").withAllowInsecureProtocol(true)

libraryDependencies ++= {
val json4sVersion = "3.5.2"
Expand All @@ -29,11 +29,14 @@ libraryDependencies ++= {
"org.clulab" % "lemport" % "0.9.10", // Portuguese lemmatizer
"de.jollyday" % "jollyday" % "0.5.10", // for holidays normalization
// logging
"ch.qos.logback" % "logback-classic" % "1.2.8", // up to 1.2.8; less than 1.2 is vulnerable
"com.typesafe.scala-logging" %% "scala-logging" % "3.7.2",
// The Scala interface is not used in processors.
// "com.typesafe.scala-logging" %% "scala-logging" % "3.7.2",
// Instead, all code makes use of the Java interface.
"org.slf4j" % "slf4j-api" % "1.7.10",
// Local logging is provided here and not published.
"ch.qos.logback" % "logback-classic" % "1.2.8", // up to 1.2.8; less than 1.2 is vulnerable
// testing
"org.scalatest" %% "scalatest" % "3.0.1" % Test,
"org.scalatest" %% "scalatest" % "3.0.1" % Test,
// trained models for local ML models used in both main and corenlp
// These are stored in the CLU lab Artifactory not maven!
"org.clulab" % "glove-840b-300d-10f-kryo" % "1.0.0",
Expand Down
12 changes: 7 additions & 5 deletions main/src/main/scala/org/clulab/dynet/Eisner.scala
Original file line number Diff line number Diff line change
Expand Up @@ -237,24 +237,26 @@ class Eisner {
dep.head - 1
}
val label = dep.label
heads(dep.mod - 1) = Tuple2(head, label)
heads(dep.mod - 1) = (head, label)
}
} else {
// Eisner failed to produce a complete tree; revert to the greedy inference
for(i <- scores.indices) {
val relativeHead = scores(i).maxBy(_._2)._1.toInt
val depMod = i + 1
val depHead = if (relativeHead == 0) 0 else depMod + relativeHead
val label = dependencies(depMod)(depHead).label
// lift() checks the index, and Option(_) checks for nulls.
val valid = dependencies(depMod).lift(depHead).flatMap(Option(_)).isDefined
val label = if (valid) dependencies(depMod)(depHead).label else "root"
val head =
if(generateRelativeHeads) {
// we are storing *relative* head positions here
relativeHead
if (valid) relativeHead else 0
} else {
// we are storing absolute heads, starting at offset 0
depHead - 1
if (valid) depHead - 1 else -1
}
heads(i) = Tuple2(head, label)
heads(i) = (head, label)
}
}
heads
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package org.clulab.embeddings
import org.clulab.utils.Closer.AutoCloser
import org.clulab.utils.InputStreamer
import org.clulab.utils.InputStreamer.StreamResult
import org.clulab.utils.ThreadUtils.NamedFuture

import scala.collection.mutable
import scala.concurrent.Await
Expand All @@ -11,7 +12,6 @@ import scala.concurrent.duration.Duration

/** Manages a pool of word embedding maps, so we do not load them more than once */
object WordEmbeddingMapPool {
import scala.concurrent.ExecutionContext.Implicits.global

case class Key(name: String, compact: Boolean)

Expand All @@ -34,19 +34,22 @@ object WordEmbeddingMapPool {

/** Fetches an embedding from the pool if it exists, or creates it otherwise */
def getOrElseCreate(name: String, compact: Boolean = false, fileLocation: String = "", resourceLocation: String = ""): WordEmbeddingMap = {
// Using the global execution context may be a bad strategy.
import scala.concurrent.ExecutionContext.Implicits.global

val wordEmbeddingMapFuture =
if (enabled)
this.synchronized {
// Access the shared pool inside the synchronized section.
pool.getOrElseUpdate(
Key(name, compact),
Future {
NamedFuture("enabled WordEmbeddingMapPool.loadEmbedding") {
loadEmbedding(name, fileLocation, resourceLocation, compact = compact)
}
)
}
else
Future {
NamedFuture("disabled WordEmbeddingMapPool.loadEmbedding") {
loadEmbedding(name, fileLocation, resourceLocation, compact = compact)
}
// Wait for the result outside the synchronized section.
Expand Down
40 changes: 22 additions & 18 deletions main/src/main/scala/org/clulab/learning/Dataset.scala
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@ import org.clulab.struct.Lexicon

import scala.io.{BufferedSource, Source}
import java.util.zip.GZIPInputStream
import java.io.{BufferedInputStream, FileInputStream, FileWriter, PrintWriter}

import org.slf4j.LoggerFactory
import java.io.{FileWriter, PrintWriter}
import org.slf4j.{Logger, LoggerFactory}
import RVFDataset._
import org.clulab.utils.Files

Expand All @@ -22,19 +21,19 @@ import org.clulab.utils.Files
abstract class Dataset[L, F](
val labelLexicon:Lexicon[L],
val featureLexicon:Lexicon[F],
val labels:ArrayBuffer[Int]) extends Serializable {
val labels:ArrayBuffer[Int]) extends IndexedSeq[Datum[L, F]] with Serializable {

def this() = this(new Lexicon[L], new Lexicon[F], new ArrayBuffer[Int])

def += (datum:Datum[L, F]): Unit

def numFeatures = featureLexicon.size
def numLabels = labelLexicon.size
def numFeatures: Int = featureLexicon.size
def numLabels: Int = labelLexicon.size

/** number of training examples */
def size = labels.size
override def size: Int = labels.size

def indices = 0 until size
override def indices: Range = labels.indices

def featuresCounter(datumOffset:Int):Counter[Int]

Expand All @@ -52,6 +51,10 @@ abstract class Dataset[L, F](

/** Convert this dataset to a CounterDataset */
def toCounterDataset:CounterDataset[L, F]

override def length: Int = size

override def apply(idx: Int): Datum[L, F] = mkDatum(idx)
}

/**
Expand Down Expand Up @@ -124,7 +127,8 @@ class BVFDataset[L, F] (

// sort all features in descending order of their IG
val fb = new ListBuffer[(Int, Double)]
for(f <- igs.keySet) fb += new Tuple2(f, igs.get(f).get.ig(total))
// The (( here and elsewhere in this file is for Scala 2.11.
for(f <- igs.keySet) fb += ((f, igs(f).ig(total)))
val sortedFeats = fb.sortBy(- _._2).toArray

// keep the top pctToKeep
Expand Down Expand Up @@ -197,7 +201,7 @@ class BVFDataset[L, F] (
for(i <- feats.indices) {
val f = feats(i)
if(featureIndexMap.contains(f)) {
newFeats += featureIndexMap.get(f).get
newFeats += featureIndexMap(f)
}
}

Expand Down Expand Up @@ -248,7 +252,7 @@ class RVFDataset[L, F] (
private def featuresCounterToArray(fs:Counter[F]):Array[(Int, Double)] = {
val fb = new ListBuffer[(Int, Double)]
for(f <- fs.keySet) {
fb += new Tuple2[Int, Double](featureLexicon.add(f), fs.getCount(f))
fb += ((featureLexicon.add(f), fs.getCount(f)))
}
fb.sortBy(_._1).toArray
}
Expand Down Expand Up @@ -327,7 +331,7 @@ class RVFDataset[L, F] (
val f = feats(i)
val v = vals(i)
if(featureIndexMap.contains(f)) {
newFeats += featureIndexMap.get(f).get
newFeats += featureIndexMap(f)
newVals += v
}
}
Expand Down Expand Up @@ -370,12 +374,12 @@ class InformationGain( var datumCount:Int = 0,
pos + neg
}

def pWith(total:InformationGain) = datumCount.toDouble / total.datumCount.toDouble
def pWithout(total:InformationGain) = (total.datumCount - datumCount).toDouble / total.datumCount.toDouble
def pWith(total:InformationGain): Double = datumCount.toDouble / total.datumCount.toDouble
def pWithout(total:InformationGain): Double = (total.datumCount - datumCount).toDouble / total.datumCount.toDouble
}

object RVFDataset {
val logger = LoggerFactory.getLogger(classOf[RVFDataset[String, String]])
val logger: Logger = LoggerFactory.getLogger(classOf[RVFDataset[String, String]])

def mkDatasetFromSvmLightResource(path: String): RVFDataset[Int, String] = {
val stream = getClass.getClassLoader.getResourceAsStream(path)
Expand Down Expand Up @@ -413,7 +417,7 @@ object RVFDataset {
content = content.trim
// logger.debug("Parsing line: [" + content + "]")

if(content.length > 0) {
if(content.nonEmpty) {
val bits = content.split("\\s+")

var label = bits(0)
Expand Down Expand Up @@ -451,7 +455,7 @@ object RVFDataset {
val fi = featureLexicon.get(k)
if(fi.isDefined) {
// logger.debug(s"Feature [$k] converted to index ${fi.get + 1}")
fs += new Tuple2(fi.get + 1, c.getCount(k))
fs += ((fi.get + 1, c.getCount(k)))
}
}
val fss = fs.toList.sortBy(_._1)
Expand Down Expand Up @@ -498,7 +502,7 @@ object RVFDataset {
content = content.trim
//logger.debug("Parsing line: " + content)

if(content.length > 0) {
if(content.nonEmpty) {
val bits = content.split("\\s+")

var label = bits(0)
Expand Down
35 changes: 35 additions & 0 deletions main/src/main/scala/org/clulab/odin/Mention.scala
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,24 @@ trait Mention extends Equals with Ordered[Mention] with Serializable {
case m: EventMention => m.newWithAttachment(mod)
}

/**
 * Returns a mention with all of `mods` attached.
 *
 * The work is delegated to the `newWithAttachments` method of the concrete
 * mention class. Only [[TextBoundMention]], [[RelationMention]] and
 * [[EventMention]] are handled here; a mention of any other runtime type
 * matches no case and results in a `scala.MatchError`.
 *
 * @param mods the attachments to add
 * @return the mention produced by the concrete class's `newWithAttachments`
 */
def withAttachments(mods: Seq[Attachment]): Mention = {
  this match {
    case textBound: TextBoundMention => textBound.newWithAttachments(mods)
    case relation: RelationMention => relation.newWithAttachments(mods)
    case event: EventMention => event.newWithAttachments(mods)
  }
}

/**
 * Returns a mention with the single attachment `mod` removed.
 *
 * The work is delegated to the `newWithoutAttachment` method of the concrete
 * mention class. Only [[TextBoundMention]], [[RelationMention]] and
 * [[EventMention]] are handled here; a mention of any other runtime type
 * matches no case and results in a `scala.MatchError`.
 *
 * @param mod the attachment to remove
 * @return the mention produced by the concrete class's `newWithoutAttachment`
 */
def withoutAttachment(mod: Attachment): Mention = {
  this match {
    case textBound: TextBoundMention => textBound.newWithoutAttachment(mod)
    case relation: RelationMention => relation.newWithoutAttachment(mod)
    case event: EventMention => event.newWithoutAttachment(mod)
  }
}

/**
 * Returns a mention with all of `mods` removed from its attachments.
 *
 * The work is delegated to the `newWithoutAttachments` method of the concrete
 * mention class. Only [[TextBoundMention]], [[RelationMention]] and
 * [[EventMention]] are handled here; a mention of any other runtime type
 * matches no case and results in a `scala.MatchError`.
 *
 * @param mods the attachments to remove
 * @return the mention produced by the concrete class's `newWithoutAttachments`
 */
def withoutAttachments(mods: Seq[Attachment]): Mention = {
  this match {
    case textBound: TextBoundMention => textBound.newWithoutAttachments(mods)
    case relation: RelationMention => relation.newWithoutAttachments(mods)
    case event: EventMention => event.newWithoutAttachments(mods)
  }
}

val paths: Map[String, Map[Mention, SynPath]]

/**
 * Looks up the [[SynPath]] stored in `paths` for `mention` under `argRole`.
 *
 * Both lookups go through `Map.apply`, so a role or mention that is not a key
 * of the corresponding map is not handled here and the lookup fails (with a
 * `NoSuchElementException` unless the map supplies a default).
 *
 * @param argRole the key into the outer `paths` map
 * @param mention the key into the inner map found for `argRole`
 * @return the path stored for that role and mention
 */
def getPath(argRole: String, mention: Mention): SynPath = {
  val pathsForRole = paths(argRole)
  pathsForRole(mention)
}
Expand Down Expand Up @@ -236,10 +248,17 @@ class TextBoundMention(
copy(attachments = this.attachments + mod)
}

/**
 * Builds a copy of this mention whose attachments are the current ones
 * combined (via `++`) with `mods`; only `attachments` is passed to `copy`.
 * NOTE(review): `attachments` looks like a Set[Attachment] -- confirm against the constructor.
 *
 * @param mods the attachments to add
 * @return the copied [[TextBoundMention]]
 */
def newWithAttachments(mods: Seq[Attachment]): TextBoundMention = {
  val extended = this.attachments ++ mods
  copy(attachments = extended)
}

/**
 * Builds a copy of this mention whose attachments are the current ones with
 * `mod` removed (via `-`); only `attachments` is passed to `copy`.
 * NOTE(review): `attachments` looks like a Set[Attachment] -- confirm against the constructor.
 *
 * @param mod the attachment to remove
 * @return the copied [[TextBoundMention]]
 */
def newWithoutAttachment(mod: Attachment): TextBoundMention = {
  val remaining = this.attachments - mod
  copy(attachments = remaining)
}

/**
 * Builds a copy of this mention whose attachments are the current ones with
 * every element of `mods` removed (via `--`); only `attachments` is passed to `copy`.
 * NOTE(review): `attachments` looks like a Set[Attachment] -- confirm against the constructor.
 *
 * @param mods the attachments to remove
 * @return the copied [[TextBoundMention]]
 */
def newWithoutAttachments(mods: Seq[Attachment]): TextBoundMention = {
  val remaining = this.attachments -- mods
  copy(attachments = remaining)
}
}

// NOTE that event mentions *may* have no arguments
Expand Down Expand Up @@ -324,10 +343,18 @@ class EventMention(
copy(attachments = this.attachments + mod)
}

/**
 * Builds a copy of this event mention whose attachments are the current ones
 * combined (via `++`) with `mods`; only `attachments` is passed to `copy`.
 * NOTE(review): `attachments` looks like a Set[Attachment] -- confirm against the constructor.
 *
 * @param mods the attachments to add
 * @return the copied [[EventMention]]
 */
def newWithAttachments(mods: Seq[Attachment]): EventMention = {
  val extended = this.attachments ++ mods
  copy(attachments = extended)
}

/**
 * Builds a copy of this event mention whose attachments are the current ones
 * with `mod` removed (via `-`); only `attachments` is passed to `copy`.
 * NOTE(review): `attachments` looks like a Set[Attachment] -- confirm against the constructor.
 *
 * @param mod the attachment to remove
 * @return the copied [[EventMention]]
 */
def newWithoutAttachment(mod: Attachment): EventMention = {
  val remaining = this.attachments - mod
  copy(attachments = remaining)
}

/**
 * Builds a copy of this event mention whose attachments are the current ones
 * with every element of `mods` removed (via `--`); only `attachments` is
 * passed to `copy`.
 * NOTE(review): `attachments` looks like a Set[Attachment] -- confirm against the constructor.
 *
 * @param mods the attachments to remove
 * @return the copied [[EventMention]]
 */
def newWithoutAttachments(mods: Seq[Attachment]): EventMention = {
  val remaining = this.attachments -- mods
  copy(attachments = remaining)
}

// Convert an EventMention to a RelationMention by deleting the trigger
def toRelationMention: RelationMention = {
new RelationMention(
Expand Down Expand Up @@ -426,10 +453,18 @@ class RelationMention(
copy(attachments = this.attachments + mod)
}

/**
 * Builds a copy of this relation mention whose attachments are the current
 * ones combined (via `++`) with `mods`; only `attachments` is passed to `copy`.
 * NOTE(review): `attachments` looks like a Set[Attachment] -- confirm against the constructor.
 *
 * @param mods the attachments to add
 * @return the copied [[RelationMention]]
 */
def newWithAttachments(mods: Seq[Attachment]): RelationMention = {
  val extended = this.attachments ++ mods
  copy(attachments = extended)
}

/**
 * Builds a copy of this relation mention whose attachments are the current
 * ones with `mod` removed (via `-`); only `attachments` is passed to `copy`.
 * NOTE(review): `attachments` looks like a Set[Attachment] -- confirm against the constructor.
 *
 * @param mod the attachment to remove
 * @return the copied [[RelationMention]]
 */
def newWithoutAttachment(mod: Attachment): RelationMention = {
  val remaining = this.attachments - mod
  copy(attachments = remaining)
}

/**
 * Builds a copy of this relation mention whose attachments are the current
 * ones with every element of `mods` removed (via `--`); only `attachments` is
 * passed to `copy`.
 * NOTE(review): `attachments` looks like a Set[Attachment] -- confirm against the constructor.
 *
 * @param mods the attachments to remove
 * @return the copied [[RelationMention]]
 */
def newWithoutAttachments(mods: Seq[Attachment]): RelationMention = {
  val remaining = this.attachments -- mods
  copy(attachments = remaining)
}

// Convert a RelationMention to an EventMention by specifying a trigger
def toEventMention(trigger: TextBoundMention): EventMention = {

Expand Down
2 changes: 1 addition & 1 deletion main/src/main/scala/org/clulab/odin/impl/RuleReader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import org.yaml.snakeyaml.constructor.{Constructor, ConstructorException}
import org.clulab.odin._
import org.clulab.odin.impl.MarkdownGeneration._
import org.clulab.utils.FileUtils
import org.clulab.utils.Closer._
import org.clulab.utils.Closer.AutoCloser



Expand Down
Loading

0 comments on commit 9b7335c

Please sign in to comment.