diff --git a/core/src/main/scala/edu/knowitall/tool/chunk/ChunkedToken.scala b/core/src/main/scala/edu/knowitall/tool/chunk/ChunkedToken.scala
index 8f409d3..7caa868 100644
--- a/core/src/main/scala/edu/knowitall/tool/chunk/ChunkedToken.scala
+++ b/core/src/main/scala/edu/knowitall/tool/chunk/ChunkedToken.scala
@@ -3,6 +3,7 @@ package tool.chunk
 
 import edu.knowitall.common.HashCodeHelper
 import edu.knowitall.tool.postag.PostaggedToken
+import edu.knowitall.tool.Format
 
 /** A representation of a chunked token. A chunked token has all the
   * aspects of a postagged token along with a chunk tag.
@@ -38,4 +39,24 @@ object ChunkedToken {
     new ChunkedToken(Symbol(chunk), token.postagSymbol, token.string, token.offset)
   def unapply(token: ChunkedToken): Option[(String, String, String, Int)] =
     Some((token.chunk, token.postag, token.string, token.offset))
+
+  object stringFormat extends Format[ChunkedToken, String] {
+    def write(chunkedToken: ChunkedToken): String = {
+      Iterator(PostaggedToken.stringFormat.write(chunkedToken), chunkedToken.chunk).mkString(" ").replaceAll("\\s+", " ")
+    }
+    def read(str: String): ChunkedToken = {
+      try {
+        val postaggedToken = PostaggedToken.stringFormat.read(str)
+        val info = str.split(" ")
+        val chunkName = info(2)
+        ChunkedToken(chunkName, postaggedToken.postag, postaggedToken.string, postaggedToken.offset)
+      }
+      catch {
+        case e: Exception => {
+          throw new MatchError("Error parsing ChunkedToken format token@offset postag chunkName for " +
+            "the serialized string " + str)
+        }
+      }
+    }
+  }
 }
diff --git a/core/src/main/scala/edu/knowitall/tool/chunk/Chunker.scala b/core/src/main/scala/edu/knowitall/tool/chunk/Chunker.scala
index 40a410f..792e9dd 100644
--- a/core/src/main/scala/edu/knowitall/tool/chunk/Chunker.scala
+++ b/core/src/main/scala/edu/knowitall/tool/chunk/Chunker.scala
@@ -87,6 +87,18 @@ object Chunker {
     val postaggedTokens = Postagger.tokensFrom(postags, tokens)
     (chunks zip postaggedTokens).map { case (chunk, postaggedToken) => ChunkedToken(postaggedToken, chunk) }
   }
+
+  object stringFormat extends Format[Seq[ChunkedToken], String] {
+    def write(chunkedTokens: Seq[ChunkedToken]): String = {
+      val serializedChunkedTokens = for (chunkedTok <- chunkedTokens) yield {
+        ChunkedToken.stringFormat.write(chunkedTok)
+      }
+      serializedChunkedTokens.mkString("\t")
+    }
+    def read(str: String): Seq[ChunkedToken] = {
+      for (s <- str.split("\t")) yield ChunkedToken.stringFormat.read(s)
+    }
+  }
 }
 
 abstract class ChunkerMain
diff --git a/core/src/main/scala/edu/knowitall/tool/postag/PostaggedToken.scala b/core/src/main/scala/edu/knowitall/tool/postag/PostaggedToken.scala
index 617ff39..6b1d89e 100644
--- a/core/src/main/scala/edu/knowitall/tool/postag/PostaggedToken.scala
+++ b/core/src/main/scala/edu/knowitall/tool/postag/PostaggedToken.scala
@@ -95,4 +95,24 @@ object PostaggedToken {
       }
     }
   }
+
+  object stringFormat extends Format[PostaggedToken, String] {
+    def write(postaggedToken: PostaggedToken): String = {
+      Iterator(Token.stringFormat.write(postaggedToken), postaggedToken.postag).mkString(" ").replaceAll("\\s+", " ")
+    }
+    def read(str: String): PostaggedToken = {
+      try {
+        val token = Token.stringFormat.read(str)
+        val info = str.split(" ")
+        val posTag = info(1)
+        PostaggedToken(posTag, token.string, token.offset)
+      }
+      catch {
+        case e: Exception => {
+          throw new MatchError("Error parsing PostaggedToken format token@offset postag for " +
+            "the serialized string " + str)
+        }
+      }
+    }
+  }
 }
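The per-token formats added above compose layer by layer: each `write` appends one space-separated field to the layer below ("string@offset", then the POS tag, then the chunk tag), and each `read` re-parses only the fields it knows about. A minimal round-trip sketch, assuming `edu.knowitall.tool.Format` is a plain write/read trait as it is used in this patch:

    import edu.knowitall.tool.chunk.ChunkedToken

    // write: Token -> "big@4", PostaggedToken -> "big@4 JJ", ChunkedToken -> "big@4 JJ NP-JJ"
    val tok = ChunkedToken("NP-JJ", "JJ", "big", 4)
    val serialized = ChunkedToken.stringFormat.write(tok)
    val restored = ChunkedToken.stringFormat.read(serialized)
    assert(restored == tok)

Since `read` splits on single spaces, the format assumes token strings themselves contain no whitespace.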
diff --git a/core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala b/core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala
index 833d781..0814105 100644
--- a/core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala
+++ b/core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala
@@ -64,6 +64,20 @@ object Postagger {
   def tokensFrom(postags: Seq[String], tokens: Seq[Token]): Seq[PostaggedToken] = {
     (postags zip tokens).map { case (postag, token) => PostaggedToken(token, postag) }
   }
+
+  object stringFormat extends Format[Seq[PostaggedToken], String] {
+
+    def write(tokens: Seq[PostaggedToken]): String = {
+      val serializedTokens = for (tok <- tokens) yield {
+        PostaggedToken.stringFormat.write(tok)
+      }
+      serializedTokens.mkString("\t")
+    }
+
+    def read(string: String): Seq[PostaggedToken] = {
+      for (str <- string.split("\t")) yield PostaggedToken.stringFormat.read(str)
+    }
+  }
 }
 
 abstract class PostaggerMain extends LineProcessor("postagger") {
diff --git a/core/src/main/scala/edu/knowitall/tool/tokenize/Token.scala b/core/src/main/scala/edu/knowitall/tool/tokenize/Token.scala
index ddf79ab..a7c0085 100644
--- a/core/src/main/scala/edu/knowitall/tool/tokenize/Token.scala
+++ b/core/src/main/scala/edu/knowitall/tool/tokenize/Token.scala
@@ -38,4 +38,18 @@ object Token {
     val splitIndex = string.lastIndexOf('@')
     Token(string.take(splitIndex), 0)
   }
+
+  object stringFormat extends Format[Token, String] {
+    def write(token: Token): String = token.string + "@" + token.offset
+    def read(str: String): Token = {
+      val info = str.split(" ")
+      val tokenRegex = "(.+)@(\\d+)".r
+      val (tokenString, tokenOffset) = info(0) match {
+        case tokenRegex(string, offset) => (string, offset)
+        case _ => throw new MatchError("Error parsing token format token@offset for token " + info(0) +
+          " in this serialized string " + str)
+      }
+      Token(tokenString, tokenOffset.toInt)
+    }
+  }
 }
diff --git a/core/src/main/scala/edu/knowitall/tool/tokenize/Tokenizer.scala b/core/src/main/scala/edu/knowitall/tool/tokenize/Tokenizer.scala
index 3ec4fc1..54668e5 100644
--- a/core/src/main/scala/edu/knowitall/tool/tokenize/Tokenizer.scala
+++ b/core/src/main/scala/edu/knowitall/tool/tokenize/Tokenizer.scala
@@ -69,6 +69,16 @@ object Tokenizer {
       case s => throw new MatchError("Could not deserialize: " + s)
     }(scala.collection.breakOut)
   }
+
+  object stringFormat extends Format[Seq[Token], String] {
+    def write(tokens: Seq[Token]): String = {
+      val serializedTokens = for (tok <- tokens) yield Token.stringFormat.write(tok)
+      serializedTokens.mkString("\t")
+    }
+    def read(str: String): Seq[Token] = {
+      for (s <- str.split("\t")) yield Token.stringFormat.read(s)
+    }
+  }
 }
 
 abstract class TokenizerMain extends LineProcessor("tokenizer") {
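The sequence-level formats are the same idea one level up: each token is written with its single-token format and the pieces are joined with tabs, so `read` only has to split on "\t". A sketch against the API added above:

    import edu.knowitall.tool.tokenize.{Token, Tokenizer}

    // Two tokens serialize to one tab-separated line: "The@0\tbig@4"
    val tokens = Seq(Token("The", 0), Token("big", 4))
    val line = Tokenizer.stringFormat.write(tokens)
    assert(Tokenizer.stringFormat.read(line) == tokens)

Because each layer's `read` ignores trailing fields, a line written by `Postagger.stringFormat` can also be read back as plain `Token`s; the tests below exercise exactly that.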
diff --git a/core/src/test/scala/edu/knowitall/tool/tokenize/TokenSpec.scala b/core/src/test/scala/edu/knowitall/tool/tokenize/TokenSpec.scala
index 6b9a5ae..2791f99 100644
--- a/core/src/test/scala/edu/knowitall/tool/tokenize/TokenSpec.scala
+++ b/core/src/test/scala/edu/knowitall/tool/tokenize/TokenSpec.scala
@@ -5,14 +5,77 @@ package tokenize
 
 import org.junit._
 import org.junit.Assert._
 import org.junit.runner.RunWith
-
 import org.specs2.mutable.Specification
 import org.specs2.runner.JUnitRunner
+import edu.knowitall.tool.postag.PostaggedToken
+import edu.knowitall.tool.postag.Postagger
+import edu.knowitall.tool.chunk.ChunkedToken
+import edu.knowitall.tool.chunk.Chunker
 
 @RunWith(classOf[JUnitRunner])
 object TokenSpecTest extends Specification {
   "tokens serialize and deserialize correctly" in {
     val token = Token("asdf", 0)
-    Token.deserialize(token.serialize) == token
+    Token.deserialize(token.serialize) === token
+
+    val t = Token("asdf", 0)
+    val tSerialized = Token.stringFormat.write(t)
+    val tDeserialized = Token.stringFormat.read(tSerialized)
+    tDeserialized === t
+
+    val pt = PostaggedToken("DT", "in", 3)
+    PostaggedToken.stringFormat.read(PostaggedToken.stringFormat.write(pt)) === pt
+  }
+
+  "tokenizer serialization and deserialization work correctly" in {
+    val token1 = Token("The", 0)
+    val token2 = Token("big", 4)
+    val tokens = Seq(token1, token2)
+    val tokensSerialization = Tokenizer.stringFormat.write(tokens)
+    Tokenizer.stringFormat.read(tokensSerialization) === tokens
+  }
+
+  "posTagger serialization and deserialization work correctly" in {
+    val posToken1 = PostaggedToken("DT", "The", 0)
+    val posToken2 = PostaggedToken("JJ", "big", 4)
+    val posTokens = Seq(posToken1, posToken2)
+    val posTokensSerialization = Postagger.stringFormat.write(posTokens)
+    Postagger.stringFormat.read(posTokensSerialization) === posTokens
+  }
+
+  "deserializing Tokens from posTagger serialization works" in {
+    val posToken1 = PostaggedToken("DT", "The", 0)
+    val token1 = Token("The", 0)
+    val posToken2 = PostaggedToken("JJ", "big", 4)
+    val token2 = Token("big", 4)
+    val posTokens = Seq(posToken1, posToken2)
+    val tokens = Seq(token1, token2)
+    val posTokensSerialization = Postagger.stringFormat.write(posTokens)
+    Tokenizer.stringFormat.read(posTokensSerialization) === tokens
+  }
+
+  "chunker/chunkedToken serialization and deserialization work correctly" in {
+    val chunkedToken1 = ChunkedToken("NP-DT", "DT", "The", 0)
+    val chunkedToken2 = ChunkedToken("NP-JJ", "JJ", "big", 4)
+    val chunkedTokens = Seq(chunkedToken1, chunkedToken2)
+    val serializedChunkedTokens = Chunker.stringFormat.write(chunkedTokens)
+    Chunker.stringFormat.read(serializedChunkedTokens) === chunkedTokens
+  }
+
+  "MatchError should be thrown when String has a non-integer offset" in {
+    val tokenSerializationString = "The@nonInteger"
+    Token.stringFormat.read(tokenSerializationString) must throwA[MatchError]
+  }
+
+  "Token should be able to contain @" in {
+    val t = Token("@", 0)
+    Token.stringFormat.read(Token.stringFormat.write(t)) === t
+  }
 }
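One property of the error path worth noting: `Token.stringFormat.read` only accepts a field that ends in '@' followed by digits, since "(.+)@(\\d+)" must match the entire field. A sketch of the failure case and the greedy-match edge case covered by the tests above (illustrative usage only, not part of the patch):

    import edu.knowitall.tool.tokenize.Token

    // Non-numeric offset: the regex does not match, so read throws a MatchError.
    try { Token.stringFormat.read("The@nonInteger") }
    catch { case e: MatchError => println("rejected as expected") }

    // Greedy "(.+)" keeps embedded '@'s with the token text:
    // "@@0" parses back to the string "@" at offset 0.
    println(Token.stringFormat.read("@@0").string)  // prints "@"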