Skip to content

Commit

Permalink
Refactor read_dic to be a bit more strict
Browse files Browse the repository at this point in the history
Related to Issue #3
  • Loading branch information
chbrown committed Dec 9, 2019
1 parent 2e97b8a commit 13d7c3a
Showing 1 changed file with 39 additions and 24 deletions.
63 changes: 39 additions & 24 deletions liwc/dic.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,45 @@
def _parse_categories(lines):
    """
    Yield (category_id, category_name) pairs from the categories section.

    Each category line is an integer id followed by a tab and then the
    category name; both are yielded as strings. Only the first tab splits the
    line, so any later tabs stay part of the name.

    Iteration stops at (and consumes) the line consisting of a single "%",
    which separates the categories section from the lexicon. Lines that do not
    contain a tab are ignored.
    """
    for raw_line in lines:
        entry = raw_line.strip()
        if entry == "%":
            # end of the categories section; the lexicon follows
            break
        if "\t" not in entry:
            # not an "id<TAB>name" line: skip it
            continue
        identifier, _, name = entry.partition("\t")
        yield identifier, name


def _parse_lexicon(lines, category_mapping):
    """
    Read (match_expression, category_names) pairs from the lexicon section.

    Each line consists of a match expression followed by a tab and then one or
    more tab-separated integers, which are mapped to category names using
    `category_mapping`.

    Blank (whitespace-only) lines are skipped, so that e.g. a trailing empty
    line does not produce an entry keyed by the empty string.

    Raises KeyError if a line references a category id that is not a key of
    `category_mapping`.
    """
    for line in lines:
        line = line.strip()
        if not line:
            # "".split("\t") is [""], which would yield a bogus ("", []) pair
            continue
        match_expression, *category_ids = line.split("\t")
        yield (
            match_expression,
            [category_mapping[category_id] for category_id in category_ids],
        )


def read_dic(filepath):
    """
    Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
    (lexicon, category_names), where:
    * `lexicon` is a dict mapping string patterns to lists of category names
    * `category_names` is a list of category names (as strings)
    """
    # the context manager guarantees the file is closed, even if parsing raises
    with open(filepath) as lines:
        # read up to first "%" (should be very first line of file)
        for line in lines:
            if line.strip() == "%":
                break
        # read categories (a mapping from integer string to category name);
        # this consumes the shared line iterator through the second "%"
        category_mapping = dict(_parse_categories(lines))
        # read lexicon (a mapping from matching string to a list of category names)
        lexicon = dict(_parse_lexicon(lines, category_mapping))
    # dicts preserve insertion order, so the names come back in file order
    return lexicon, list(category_mapping.values())

0 comments on commit 13d7c3a

Please sign in to comment.