From 483dfdcd7cc0ca1234b1d6af83126215938aa75f Mon Sep 17 00:00:00 2001 From: Michael Schmitz Date: Wed, 16 Oct 2013 08:29:10 -0700 Subject: [PATCH 1/2] Add Format class and a brat implementation. --- core/src/main/scala/edu/knowitall/tool/Format.scala | 12 ++++++++++++ .../scala/edu/knowitall/tool/postag/Postagger.scala | 13 +++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 core/src/main/scala/edu/knowitall/tool/Format.scala diff --git a/core/src/main/scala/edu/knowitall/tool/Format.scala b/core/src/main/scala/edu/knowitall/tool/Format.scala new file mode 100644 index 0000000..07de1aa --- /dev/null +++ b/core/src/main/scala/edu/knowitall/tool/Format.scala @@ -0,0 +1,12 @@ +package edu.knowitall.tool + +trait Writer[F, T] { + def write(from: F): T +} + +trait Reader[F, T] { + def read(from: F): T +} + +trait Format[F, T] +extends Writer[F, T] with Reader[T, F] diff --git a/core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala b/core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala index 833d781..e843e49 100644 --- a/core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala +++ b/core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala @@ -64,6 +64,19 @@ object Postagger { def tokensFrom(postags: Seq[String], tokens: Seq[Token]): Seq[PostaggedToken] = { (postags zip tokens).map { case (postag, token) => PostaggedToken(token, postag) } } + + object bratFormat extends Format[PostaggedToken, String] { + def write(token: PostaggedToken): String = { + Iterator(token.postag, token.offset, token.offsets.end, token.string).mkString("\t") + } + + def read(string: String): PostaggedToken = { + string.split("\t") match { + case Array(postag, offset, _, token) => PostaggedToken(postag, token, offset.toInt) + case _ => throw new MatchError("Could not match BRAT PostaggedToken: " + string) + } + } + } } abstract class PostaggerMain extends LineProcessor("postagger") { From 34c33b256bccef449fe8ce93170aeea327df6345 Mon Sep 17 00:00:00 2001 
From: Michael Schmitz Date: Wed, 16 Oct 2013 10:17:11 -0700 Subject: [PATCH 2/2] Move serialization to PostaggedToken. --- .../tool/postag/PostaggedToken.scala | 21 +++++++++++++++++-- .../edu/knowitall/tool/postag/Postagger.scala | 13 ------------ 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/core/src/main/scala/edu/knowitall/tool/postag/PostaggedToken.scala b/core/src/main/scala/edu/knowitall/tool/postag/PostaggedToken.scala index 59a4b85..617ff39 100644 --- a/core/src/main/scala/edu/knowitall/tool/postag/PostaggedToken.scala +++ b/core/src/main/scala/edu/knowitall/tool/postag/PostaggedToken.scala @@ -1,5 +1,5 @@ -package edu.knowitall -package tool.postag +package edu.knowitall.tool +package postag import edu.knowitall.common.HashCodeHelper import edu.knowitall.tool.tokenize.Token @@ -78,4 +78,21 @@ object PostaggedToken { def apply(token: Token, postag: String): PostaggedToken = PostaggedToken(postag, token.string, token.offset) def unapply(token: PostaggedToken): Option[(String, String, Int)] = Some((token.postag, token.string, token.offset)) + + object bratFormat extends Format[PostaggedToken, String] { + def write(token: PostaggedToken): String = { + Iterator(token.postag + " " + token.offset + " " + token.offsets.end, token.string).mkString("\t") + } + + def read(string: String): PostaggedToken = { + string.split("\t") match { + case Array(meat, token) => + meat.split("\\s+") match { + case Array(postag, offset, _) => PostaggedToken(postag, token, offset.toInt) + case _ => throw new MatchError("Could not match BRAT PostaggedToken: " + string) + } + case _ => throw new MatchError("Could not match BRAT PostaggedToken: " + string) + } + } + } } diff --git a/core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala b/core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala index e843e49..833d781 100644 --- a/core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala +++ 
b/core/src/main/scala/edu/knowitall/tool/postag/Postagger.scala @@ -64,19 +64,6 @@ object Postagger { def tokensFrom(postags: Seq[String], tokens: Seq[Token]): Seq[PostaggedToken] = { (postags zip tokens).map { case (postag, token) => PostaggedToken(token, postag) } } - - object bratFormat extends Format[PostaggedToken, String] { - def write(token: PostaggedToken): String = { - Iterator(token.postag, token.offset, token.offsets.end, token.string).mkString("\t") - } - - def read(string: String): PostaggedToken = { - string.split("\t") match { - case Array(postag, offset, _, token) => PostaggedToken(postag, token, offset.toInt) - case _ => throw new MatchError("Could not match BRAT PostaggedToken: " + string) - } - } - } } abstract class PostaggerMain extends LineProcessor("postagger") {