This repository has been archived by the owner on Feb 15, 2024. It is now read-only.

Added Token/Tokenizer and PostaggedToken/PosTagger Serialization #28

Merged: 3 commits, merged Oct 24, 2013
Changes from 1 commit
19 changes: 19 additions & 0 deletions core/src/main/scala/edu/knowitall/tool/postag/PostaggedToken.scala
@@ -95,4 +95,23 @@ object PostaggedToken {
}
}
}

object serialization extends Format[PostaggedToken, String]{
def write(postaggedToken: PostaggedToken): String = {
Iterator(Token.serialization.write(postaggedToken),postaggedToken.postag).mkString(" ").replaceAll("\\s+", " ")
}
def read(str: String): PostaggedToken = {
try{
val token = Token.serialization.read(str)
val info = str.split(" ")
Member commented:
I would write this as val postag = str.split(" ") match { case Array(postag) => postag }.

You could add a case here to handle the exception and give an informative error too (i.e. case _ => throw new MatchError)

val posTag = info(1)
PostaggedToken.apply(posTag,token.string,token.offset)
}
catch{
case e: Exception => {
throw new MatchError("Could not match PostaggedToken in serialized form")
Member commented:
maybe append the serialized string. I.e. "...:" + str.

}
}
}
}
}
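Putting the two review comments above together, a minimal sketch of a pattern-match style read (the "string@offset postag" layout comes from write above; the exact error text is an assumption):

def read(str: String): PostaggedToken = str.split(" ") match {
  case Array(tokenPart, postag) =>
    // tokenPart is the "string@offset" piece; delegate to Token's reader
    val token = Token.serialization.read(tokenPart)
    PostaggedToken(postag, token.string, token.offset)
  case _ =>
    // append the serialized input so the failure is easy to diagnose
    throw new MatchError("Could not match PostaggedToken in serialized form: " + str)
}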
14 changes: 14 additions & 0 deletions core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala
@@ -64,6 +64,20 @@ object Postagger {
def tokensFrom(postags: Seq[String], tokens: Seq[Token]): Seq[PostaggedToken] = {
(postags zip tokens).map { case (postag, token) => PostaggedToken(token, postag) }
}

object serialization extends Format[Seq[PostaggedToken],String]{

def write(tokens: Seq[PostaggedToken]): String = {
val serializedTokens = for(tok <- tokens) yield {
PostaggedToken.serialization.write(tok)
}
serializedTokens.mkString("\t")
}

def read(string: String): Seq[PostaggedToken] ={
for (str <- string.split("\t")) yield PostaggedToken.serialization.read(str)
}
}
}

abstract class PostaggerMain extends LineProcessor("postagger") {
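As a usage sketch of the serialization added above (the token values are illustrative): each token is written as string@offset followed by its POS tag, and tokens are joined with tabs.

val tagged = Seq(PostaggedToken("DT", "The", 0), PostaggedToken("JJ", "big", 4))
val s = Postagger.serialization.write(tagged)   // "The@0 DT\tbig@4 JJ"
val back = Postagger.serialization.read(s)      // round-trips to the original sequence
require(back == tagged)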
18 changes: 18 additions & 0 deletions core/src/main/scala/edu/knowitall/tool/tokenize/Token.scala
@@ -38,4 +38,22 @@ object Token {
val splitIndex = string.lastIndexOf('@')
Token(string.take(splitIndex), 0)
}

object serialization extends Format[Token, String]{
def write(token: Token): String = token.string+"@"+token.offset
def read(str: String): Token = {
try{
val info = str.split(" ")
val tokenSplit = info(0).split("@")
val tokenOffset = tokenSplit(1).toInt
val tokenString = tokenSplit(0)
Member commented:
val (tokenOffset, tokenString) = info(0).split("@") match { 
case Array(offset, string) => (offset, string) 
case _ => throw new MatchError("informative message) 
}

Token(tokenString,tokenOffset)
}
catch{
case e: Exception =>{
throw new MatchError("Could not match Token in serialized form")
}
}
}
}
}
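A compilable sketch of the pattern-match style the reviewer suggests for Token's read, keeping the space-split from the diff so a trailing POS tag is ignored; note that split("@") yields the string before the offset, and the error text is an assumption:

def read(str: String): Token = str.split(" ").head.split("@") match {
  case Array(tokenString, tokenOffset) => Token(tokenString, tokenOffset.toInt)
  case _ => throw new MatchError("Could not match Token in serialized form: " + str)
}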
10 changes: 10 additions & 0 deletions core/src/main/scala/edu/knowitall/tool/tokenize/Tokenizer.scala
@@ -69,6 +69,16 @@ object Tokenizer {
case s => throw new MatchError("Could not deserialize: " + s)
}(scala.collection.breakOut)
}

object serialization extends Format[Seq[Token],String]{
def write(tokens: Seq[Token]):String = {
Member commented:
space before return type

val serializedTokens = for(tok <- tokens) yield Token.serialization.write(tok)
serializedTokens.mkString("\t")
}
def read(str: String): Seq[Token] = {
for (s <- str.split("\t")) yield Token.serialization.read(s)
}
}
}

abstract class TokenizerMain extends LineProcessor("tokenizer") {
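Per the style comment above, the only change would be a space before the return type; a sketch of the corrected declaration:

def write(tokens: Seq[Token]): String = {
  val serializedTokens = for (tok <- tokens) yield Token.serialization.write(tok)
  serializedTokens.mkString("\t")
}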
40 changes: 39 additions & 1 deletion core/src/test/scala/edu/knowitall/tool/tokenize/TokenSpec.scala
@@ -5,14 +5,52 @@ package tokenize
import org.junit._
import org.junit.Assert._
import org.junit.runner.RunWith

import org.specs2.mutable.Specification
import org.specs2.runner.JUnitRunner
import edu.knowitall.tool.postag.PostaggedToken
import edu.knowitall.tool.postag.Postagger

@RunWith(classOf[JUnitRunner])
object TokenSpecTest extends Specification {
"tokens serialize and deserialize correctly" in {
val token = Token("asdf", 0)
Token.deserialize(token.serialize) == token

val t = Token("asdf",0)
val tSerialized = Token.serialization.write(t)
val tDeserialized = Token.serialization.read(tSerialized)
tDeserialized == t
Member commented:
Use === so you get better error messages when it fails.


val pt = PostaggedToken("DT","in",3)
PostaggedToken.serialization.read(PostaggedToken.serialization.write(pt)) == pt
}

"tokenizer serialization and deserialization work correctly" in {

val token1 = Token("The",0)
val token2 = Token("big",4)
val tokens = Seq(token1,token2)
val tokensSerialization = Tokenizer.serialization.write(tokens)
Tokenizer.serialization.read(tokensSerialization) == tokens

}

"posTaggedTokenizer serialization and deserialization work correctly" in {
val posToken1 = PostaggedToken("DT","The",0)
val posToken2 = PostaggedToken("JJ","big",4)
val posTokens = Seq(posToken1,posToken2)
val posTokensSerialization = Postagger.serialization.write(posTokens)
Postagger.serialization.read(posTokensSerialization) == posTokens
}

"deserializing Tokens from posTagger serialization works" in {
val posToken1 = PostaggedToken("DT","The",0)
val token1 = Token("The",0)
val posToken2 = PostaggedToken("JJ","big",4)
val token2 = Token("big",4)
val posTokens = Seq(posToken1,posToken2)
val tokens = Seq(token1,token2)
val posTokensSerialization = Postagger.serialization.write(posTokens)
Tokenizer.serialization.read(posTokensSerialization) == tokens
}
}
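Following the reviewer's note on ===, the first spec could assert with specs2's === so a failure reports both values; a minimal sketch (assuming the project's specs2 version supports it, which the org.specs2.mutable.Specification import suggests):

"tokens serialize and deserialize correctly" in {
  val t = Token("asdf", 0)
  Token.serialization.read(Token.serialization.write(t)) === t

  val pt = PostaggedToken("DT", "in", 3)
  PostaggedToken.serialization.read(PostaggedToken.serialization.write(pt)) === pt
}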