Refactor read_dic to be a bit more strict

Related to Issue #3
chbrown · Dec 9, 2019 · 13d7c3a · 13d7c3a
1 parent 2e97b8a
commit 13d7c3a
Showing 1 changed file with 39 additions and 24 deletions.
diff --git a/liwc/dic.py b/liwc/dic.py
@@ -1,30 +1,45 @@
+def _parse_categories(lines):
+    """
+    Read (category_id, category_name) pairs from the categories section.
+    Each line consists of an integer followed a tab and then the category name.
+    This section is separated from the lexicon by a line consisting of a single "%".
+    """
+    for line in lines:
+        line = line.strip()
+        if line == "%":
+            return
+        # ignore non-matching groups of categories
+        if "\t" in line:
+            category_id, category_name = line.split("\t", 1)
+            yield category_id, category_name
+
+
+def _parse_lexicon(lines, category_mapping):
+    """
+    Read (match_expression, category_names) pairs from the lexicon section.
+    Each line consists of a match expression followed by a tab and then one or more
+    tab-separated integers, which are mapped to category names using `category_mapping`.
+    """
+    for line in lines:
+        line = line.strip()
+        parts = line.split("\t")
+        yield parts[0], [category_mapping[category_id] for category_id in parts[1:]]
+
+
 def read_dic(filepath):
     """
     Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
     (lexicon, category_names), where:
     * `lexicon` is a dict mapping string patterns to lists of category names
-    * `categories` is a list of category names (as strings)
-
+    * `category_names` is a list of category names (as strings)
     """
-    # category_mapping is a mapping from integer string to category name
-    category_mapping = {}
-    # category_names is equivalent to category_mapping.values() but retains original ordering
-    category_names = []
-    lexicon = {}
-    # the mode is incremented by each '%' line in the file
-    mode = 0
-    for line in open(filepath):
-        tsv = line.strip()
-        if tsv:
-            parts = tsv.split()
-            if parts[0] == "%":
-                mode += 1
-            elif mode == 1:
-                # definining categories
-                category_names.append(parts[1])
-                category_mapping[parts[0]] = parts[1]
-            elif mode == 2:
-                lexicon[parts[0]] = [
-                    category_mapping[category_id] for category_id in parts[1:]
-                ]
-    return lexicon, category_names
+    with open(filepath) as lines:
+        # read up to first "%" (should be very first line of file)
+        for line in lines:
+            if line.strip() == "%":
+                break
+        # read categories (a mapping from integer string to category name)
+        category_mapping = dict(_parse_categories(lines))
+        # read lexicon (a mapping from matching string to a list of category names)
+        lexicon = dict(_parse_lexicon(lines, category_mapping))
+    return lexicon, list(category_mapping.values())