This repository has been archived by the owner on Dec 1, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 53
/
Copy pathCMUTweetTagger.py
99 lines (82 loc) · 4.43 KB
/
CMUTweetTagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Simple Python wrapper for runTagger.sh script for CMU's Tweet Tokeniser and Part of Speech tagger: http://www.ark.cs.cmu.edu/TweetNLP/
Usage:
results=runtagger_parse(['example tweet 1', 'example tweet 2'])
results will contain a list of lists (one per tweet) of triples, each triple represents (term, type, confidence)
"""
import subprocess
import shlex
# The only relavent source I've found is here:
# http://m1ked.com/post/12304626776/pos-tagger-for-twitter-successfully-implemented-in
# which is a very simple implementation, my implementation is a bit more
# useful (but not much).
# NOTE this command is directly lifted from runTagger.sh
RUN_TAGGER_CMD = "java -XX:ParallelGCThreads=2 -Xmx500m -jar ark-tweet-nlp-0.3.2.jar"
def _split_results(rows):
"""Parse the tab-delimited returned lines, modified from: https://github.com/brendano/ark-tweet-nlp/blob/master/scripts/show.py"""
for line in rows:
line = line.strip() # remove '\n'
if len(line) > 0:
if line.count('\t') == 2:
parts = line.split('\t')
tokens = parts[0]
tags = parts[1]
confidence = float(parts[2])
yield tokens, tags, confidence
def _call_runtagger(tweets, run_tagger_cmd=RUN_TAGGER_CMD):
"""Call runTagger.sh using a named input file"""
# remove carriage returns as they are tweet separators for the stdin
# interface
tweets_cleaned = [tw.replace('\n', ' ') for tw in tweets]
message = "\n".join(tweets_cleaned)
# force UTF-8 encoding (from internal unicode type) to avoid .communicate encoding error as per:
# http://stackoverflow.com/questions/3040101/python-encoding-for-pipe-communicate
message = message.encode('utf-8')
# build a list of args
args = shlex.split(run_tagger_cmd)
args.append('--output-format')
args.append('conll')
po = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# old call - made a direct call to runTagger.sh (not Windows friendly)
#po = subprocess.Popen([run_tagger_cmd, '--output-format', 'conll'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
result = po.communicate(message)
# expect a tuple of 2 items like:
# ('hello\t!\t0.9858\nthere\tR\t0.4168\n\n',
# 'Listening on stdin for input. (-h for help)\nDetected text input format\nTokenized and tagged 1 tweets (2 tokens) in 7.5 seconds: 0.1 tweets/sec, 0.3 tokens/sec\n')
pos_result = result[0].decode('utf-8').strip('\n\n') # get first line, remove final double carriage return
pos_result = pos_result.split('\n\n') # split messages by double carriage returns
pos_results = [pr.split('\n') for pr in pos_result] # split parts of message by each carriage return
return pos_results
def runtagger_parse(tweets, run_tagger_cmd=RUN_TAGGER_CMD):
"""Call runTagger.sh on a list of tweets, parse the result, return lists of tuples of (term, type, confidence)"""
pos_raw_results = _call_runtagger(tweets, run_tagger_cmd)
pos_result = []
for pos_raw_result in pos_raw_results:
pos_result.append([x for x in _split_results(pos_raw_result)])
return pos_result
def check_script_is_present(run_tagger_cmd=RUN_TAGGER_CMD):
"""Simple test to make sure we can see the script"""
success = False
try:
args = shlex.split(run_tagger_cmd)
args.append("--help")
po = subprocess.Popen(args, stdout=subprocess.PIPE)
# old call - made a direct call to runTagger.sh (not Windows friendly)
#po = subprocess.Popen([run_tagger_cmd, '--help'], stdout=subprocess.PIPE)
while not po.poll():
lines = [l for l in po.stdout]
# we expected the first line of --help to look like the following:
assert "RunTagger [options]" in lines[0].decode('utf-8')
success = True
except OSError as err:
print("Caught an OSError, have you specified the correct path to runTagger.sh? We are using \"%s\". Exception: %r" % (run_tagger_cmd, repr(err)))
return success
if __name__ == "__main__":
print("Checking that we can see \"%s\", this will crash if we can't" % (RUN_TAGGER_CMD))
success = check_script_is_present()
if success:
print("Success.")
print("Now pass in two messages, get a list of tuples back:")
tweets = ['this is a message', 'and a second message']
print(runtagger_parse(tweets))