
Merge pull request #28 from jgilme1/moreSerialization
Added Token/Tokenizer and PostaggedToken/PosTagger Serialization
schmmd committed Oct 24, 2013
2 parents daca5e9 + 1673acb commit 976b95a
Showing 7 changed files with 156 additions and 2 deletions.
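
In short, each tool gains a stringFormat object implementing Format[T, String], so serialization becomes a write/read round trip. A minimal usage sketch, assuming the API added in this commit (the token values are illustrative, taken from the tests below):

import edu.knowitall.tool.tokenize.{Token, Tokenizer}
import edu.knowitall.tool.postag.{PostaggedToken, Postagger}

// A single token serializes to "The@0".
val token = Token("The", 0)
val serialized = Token.stringFormat.write(token)
assert(Token.stringFormat.read(serialized) == token)

// A tagged sentence serializes one token per tab-separated field.
val tagged = Seq(PostaggedToken("DT", "The", 0), PostaggedToken("JJ", "big", 4))
assert(Postagger.stringFormat.read(Postagger.stringFormat.write(tagged)) == tagged)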
21 changes: 21 additions & 0 deletions core/src/main/scala/edu/knowitall/tool/chunk/ChunkedToken.scala
@@ -3,6 +3,7 @@ package tool.chunk

import edu.knowitall.common.HashCodeHelper
import edu.knowitall.tool.postag.PostaggedToken
import edu.knowitall.tool.Format

/** A representation of a chunked token. A chunked token has all the
* aspects of a postagged token along with a chunk tag.
@@ -38,4 +39,24 @@ object ChunkedToken {
new ChunkedToken(Symbol(chunk), token.postagSymbol, token.string, token.offset)

def unapply(token: ChunkedToken): Option[(String, String, String, Int)] = Some((token.chunk, token.postag, token.string, token.offset))

  object stringFormat extends Format[ChunkedToken, String] {
    def write(chunkedToken: ChunkedToken): String = {
      Iterator(PostaggedToken.stringFormat.write(chunkedToken), chunkedToken.chunk).mkString(" ").replaceAll("\\s+", " ")
    }
    def read(str: String): ChunkedToken = {
      try {
        val postaggedToken = PostaggedToken.stringFormat.read(str)
        val info = str.split(" ")
        val chunkName = info(2)
        ChunkedToken(chunkName, postaggedToken.postag, postaggedToken.string, postaggedToken.offset)
      } catch {
        case e: Exception =>
          throw new MatchError("Error parsing ChunkedToken format token@offset postag chunkName for " +
            "the serialized string " + str)
      }
    }
  }
}
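
For reference, the serialized forms layer on one another, each level appending one space-separated field; the strings in the comments below are inferred from the write implementations in this commit and are illustrative:

Token.stringFormat.write(Token("The", 0)) // "The@0"
PostaggedToken.stringFormat.write(PostaggedToken("DT", "The", 0)) // "The@0 DT"
ChunkedToken.stringFormat.write(ChunkedToken("NP-DT", "DT", "The", 0)) // "The@0 DT NP-DT"

Note that read recovers the chunk tag via str.split(" "), so this format assumes token strings contain no spaces.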
12 changes: 12 additions & 0 deletions core/src/main/scala/edu/knowitall/tool/chunk/Chunker.scala
@@ -87,6 +87,18 @@ object Chunker {
val postaggedTokens = Postagger.tokensFrom(postags, tokens)
(chunks zip postaggedTokens).map { case (chunk, postaggedToken) => ChunkedToken(postaggedToken, chunk) }
}

  object stringFormat extends Format[Seq[ChunkedToken], String] {
    def write(chunkedTokens: Seq[ChunkedToken]): String = {
      val serializedChunkedTokens = for (chunkedTok <- chunkedTokens) yield {
        ChunkedToken.stringFormat.write(chunkedTok)
      }
      serializedChunkedTokens.mkString("\t")
    }
    def read(str: String): Seq[ChunkedToken] = {
      for (s <- str.split("\t")) yield ChunkedToken.stringFormat.read(s)
    }
  }
}

abstract class ChunkerMain
20 changes: 20 additions & 0 deletions core/src/main/scala/edu/knowitall/tool/postag/PostaggedToken.scala
@@ -95,4 +95,24 @@ object PostaggedToken {
}
}
}

  object stringFormat extends Format[PostaggedToken, String] {
    def write(postaggedToken: PostaggedToken): String = {
      Iterator(Token.stringFormat.write(postaggedToken), postaggedToken.postag).mkString(" ").replaceAll("\\s+", " ")
    }
    def read(str: String): PostaggedToken = {
      try {
        val token = Token.stringFormat.read(str)
        val info = str.split(" ")
        val posTag = info(1)
        PostaggedToken(posTag, token.string, token.offset)
      } catch {
        case e: Exception =>
          throw new MatchError("Error parsing PostaggedToken format token@offset postag for " +
            "the serialized string " + str)
      }
    }
  }
}
14 changes: 14 additions & 0 deletions core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala
@@ -64,6 +64,20 @@ object Postagger {
def tokensFrom(postags: Seq[String], tokens: Seq[Token]): Seq[PostaggedToken] = {
(postags zip tokens).map { case (postag, token) => PostaggedToken(token, postag) }
}

  object stringFormat extends Format[Seq[PostaggedToken], String] {
    def write(tokens: Seq[PostaggedToken]): String = {
      val serializedTokens = for (tok <- tokens) yield {
        PostaggedToken.stringFormat.write(tok)
      }
      serializedTokens.mkString("\t")
    }
    def read(string: String): Seq[PostaggedToken] = {
      for (str <- string.split("\t")) yield PostaggedToken.stringFormat.read(str)
    }
  }
}

abstract class PostaggerMain extends LineProcessor("postagger") {
14 changes: 14 additions & 0 deletions core/src/main/scala/edu/knowitall/tool/tokenize/Token.scala
@@ -38,4 +38,18 @@ object Token {
val splitIndex = string.lastIndexOf('@')
Token(string.take(splitIndex), 0)
}

  object stringFormat extends Format[Token, String] {
    def write(token: Token): String = token.string + "@" + token.offset
    def read(str: String): Token = {
      val info = str.split(" ")
      val tokenRegex = "(.+)@(\\d+)".r
      val (tokenString, tokenOffset) = info(0) match {
        case tokenRegex(string, offset) => (string, offset)
        case _ => throw new MatchError("Error parsing token format token@offset for token " + info(0) +
          " in this serialized string " + str)
      }
      Token(tokenString, tokenOffset.toInt)
    }
  }
}
10 changes: 10 additions & 0 deletions core/src/main/scala/edu/knowitall/tool/tokenize/Tokenizer.scala
@@ -69,6 +69,16 @@ object Tokenizer {
case s => throw new MatchError("Could not deserialize: " + s)
}(scala.collection.breakOut)
}

  object stringFormat extends Format[Seq[Token], String] {
    def write(tokens: Seq[Token]): String = {
      val serializedTokens = for (tok <- tokens) yield Token.stringFormat.write(tok)
      serializedTokens.mkString("\t")
    }
    def read(str: String): Seq[Token] = {
      for (s <- str.split("\t")) yield Token.stringFormat.read(s)
    }
  }
}

abstract class TokenizerMain extends LineProcessor("tokenizer") {
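At the sequence level, all three formats share one convention: each token is written independently and the results are joined with tabs. A small sketch, assuming the stringFormat objects defined in this commit (expected output in the comment):

Tokenizer.stringFormat.write(Seq(Token("The", 0), Token("big", 4))) // yields "The@0" + "\t" + "big@4"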
67 changes: 65 additions & 2 deletions core/src/test/scala/edu/knowitall/tool/tokenize/TokenSpec.scala
@@ -5,14 +5,77 @@ package tokenize
import org.junit._
import org.junit.Assert._
import org.junit.runner.RunWith

import org.specs2.mutable.Specification
import org.specs2.runner.JUnitRunner
import edu.knowitall.tool.postag.PostaggedToken
import edu.knowitall.tool.postag.Postagger
import edu.knowitall.tool.chunk.ChunkedToken
import edu.knowitall.tool.chunk.Chunker

@RunWith(classOf[JUnitRunner])
object TokenSpecTest extends Specification {
"tokens serialize and deserialize correctly" in {
val token = Token("asdf", 0)
Token.deserialize(token.serialize) == token
Token.deserialize(token.serialize) === token

val t = Token("asdf",0)
val tSerialized = Token.stringFormat.write(t)
val tDeserialized = Token.stringFormat.read(tSerialized)
tDeserialized === t

val pt = PostaggedToken("DT","in",3)
PostaggedToken.stringFormat.read(PostaggedToken.stringFormat.write(pt)) == pt
}

"tokenizer serialization and deserialization work correctly" in {

val token1 = Token("The",0)
val token2 = Token("big",4)
val tokens = Seq(token1,token2)
val tokensSerialization = Tokenizer.stringFormat.write(tokens)
Tokenizer.stringFormat.read(tokensSerialization) === tokens

}

"posTagger serialization and deserialization work correctly" in {
val posToken1 = PostaggedToken("DT","The",0)
val posToken2 = PostaggedToken("JJ","big",4)
val posTokens = Seq(posToken1,posToken2)
val posTokensSerialization = Postagger.stringFormat.write(posTokens)
Postagger.stringFormat.read(posTokensSerialization) === posTokens
}

"deserializing Tokens from posTagger serialization works" in {
val posToken1 = PostaggedToken("DT","The",0)
val token1 = Token("The",0)
val posToken2 = PostaggedToken("JJ","big",4)
val token2 = Token("big",4)
val posTokens = Seq(posToken1,posToken2)
val tokens = Seq(token1,token2)
val posTokensSerialization = Postagger.stringFormat.write(posTokens)
Tokenizer.stringFormat.read(posTokensSerialization) === tokens
}

"chunker/chunkedToken serialization and deserialization work correctly" in {
val chunkedToken1 = ChunkedToken("NP-DT","DT","The",0)
val chunkedToken2 = ChunkedToken("NP-JJ","JJ","big",4)
val chunkedTokens = Seq(chunkedToken1,chunkedToken2)
val serializedChunkedTokens = Chunker.stringFormat.write(chunkedTokens)
Chunker.stringFormat.read(serializedChunkedTokens) === chunkedTokens
}

"MatchError should be thrown when String has a non-integer offset" in {
val tokenSerializationString = "The@nonInteger"
try{
val token = Token.stringFormat.read(tokenSerializationString)
}
catch{
case e: MatchError => { }
}
}

"Token should be able to contain @" in {
val t = Token("@",0)
Token.stringFormat.read(Token.stringFormat.write(t)) == t
}
}
