-
Notifications
You must be signed in to change notification settings - Fork 50
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor read_dic to be a bit more strict
Related to Issue #3
- Loading branch information
Showing
1 changed file
with
39 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,45 @@ | ||
def _parse_categories(lines): | ||
""" | ||
Read (category_id, category_name) pairs from the categories section. | ||
Each line consists of an integer followed a tab and then the category name. | ||
This section is separated from the lexicon by a line consisting of a single "%". | ||
""" | ||
for line in lines: | ||
line = line.strip() | ||
if line == "%": | ||
return | ||
# ignore non-matching groups of categories | ||
if "\t" in line: | ||
category_id, category_name = line.split("\t", 1) | ||
yield category_id, category_name | ||
|
||
|
||
def _parse_lexicon(lines, category_mapping): | ||
""" | ||
Read (match_expression, category_names) pairs from the lexicon section. | ||
Each line consists of a match expression followed by a tab and then one or more | ||
tab-separated integers, which are mapped to category names using `category_mapping`. | ||
""" | ||
for line in lines: | ||
line = line.strip() | ||
parts = line.split("\t") | ||
yield parts[0], [category_mapping[category_id] for category_id in parts[1:]] | ||
|
||
|
||
def read_dic(filepath):
    """
    Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
    (lexicon, category_names), where:
    * `lexicon` is a dict mapping string patterns to lists of category names
    * `category_names` is a list of category names (as strings)

    The file is read in three consecutive phases from a single line iterator:
    everything up to the first "%" line is discarded, the categories section
    is read up to the next "%" line, and all remaining lines are read as the
    lexicon.
    """
    # the context manager guarantees the file is closed, even if parsing fails
    with open(filepath) as lines:
        # read up to first "%" (should be very first line of file)
        for line in lines:
            if line.strip() == "%":
                break
        # read categories (a mapping from integer string to category name)
        category_mapping = dict(_parse_categories(lines))
        # read lexicon (a mapping from matching string to a list of category names)
        lexicon = dict(_parse_lexicon(lines, category_mapping))
    # dict values follow insertion order (guaranteed on Python 3.7+), so the
    # names come back in the order the categories were defined in the file
    return lexicon, list(category_mapping.values())