# Reconstructed post-patch contents of liwc/dic.py
# (commit 13d7c3a, "Refactor read_dic to be a bit more strict", related to Issue #3).


def _parse_categories(lines):
    """
    Read (category_id, category_name) pairs from the categories section.

    Each line consists of an integer followed by a tab and then the category name.
    This section is separated from the lexicon by a line consisting of a single "%".

    `lines` is an iterator of strings; it is consumed up to and including the
    terminating "%" line, so the caller can keep reading the lexicon from it.
    Category ids are yielded as strings, exactly as they appear in the file.
    """
    for line in lines:
        line = line.strip()
        if line == "%":
            return
        # ignore non-matching groups of categories (lines without a tab)
        if "\t" in line:
            # split only on the first tab; the rest of the line is the name
            category_id, category_name = line.split("\t", 1)
            yield category_id, category_name


def _parse_lexicon(lines, category_mapping):
    """
    Read (match_expression, category_names) pairs from the lexicon section.

    Each line consists of a match expression followed by a tab and then one or more
    tab-separated integers, which are mapped to category names using `category_mapping`.

    Blank (or whitespace-only) lines are skipped. Raises KeyError if a line
    references a category id that is not a key of `category_mapping`.
    """
    for line in lines:
        line = line.strip()
        if not line:
            # A blank line (e.g. trailing newline at end of file) would otherwise
            # produce a bogus ("", []) entry in the lexicon.
            continue
        parts = line.split("\t")
        yield parts[0], [category_mapping[category_id] for category_id in parts[1:]]


def read_dic(filepath):
    """
    Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
    (lexicon, category_names), where:
    * `lexicon` is a dict mapping string patterns to lists of category names
    * `category_names` is a list of category names (as strings), in the order
      in which they are declared in the file
    """
    with open(filepath) as lines:
        # read up to first "%" (should be very first line of file)
        for line in lines:
            if line.strip() == "%":
                break
        # read categories (a mapping from integer string to category name);
        # this consumes the file up to and including the second "%" line
        category_mapping = dict(_parse_categories(lines))
        # read lexicon (a mapping from matching string to a list of category names)
        lexicon = dict(_parse_lexicon(lines, category_mapping))
    return lexicon, list(category_mapping.values())