From 13d7c3ae814c0d023fdf7ff44e75a95ce87574ad Mon Sep 17 00:00:00 2001
From: Christopher Brown <io@henrian.com>
Date: Sun, 8 Dec 2019 23:15:23 -0500
Subject: [PATCH] Refactor read_dic to be a bit more strict

Related to Issue #3
---
 liwc/dic.py | 63 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 24 deletions(-)

diff --git a/liwc/dic.py b/liwc/dic.py
index 2deda05..b9d4f0c 100644
--- a/liwc/dic.py
+++ b/liwc/dic.py
@@ -1,30 +1,45 @@
+def _parse_categories(lines):
+    """
+    Read (category_id, category_name) pairs from the categories section.
+    Each line consists of an integer followed by a tab and then the category name.
+    This section is separated from the lexicon by a line consisting of a single "%".
+    """
+    for line in lines:
+        line = line.strip()
+        if line == "%":
+            return
+        # skip lines without a tab: they are not "id<TAB>name" category entries
+        if "\t" in line:
+            category_id, category_name = line.split("\t", 1)
+            yield category_id, category_name
+
+
+def _parse_lexicon(lines, category_mapping):
+    """
+    Read (match_expression, category_names) pairs from the lexicon section.
+    Each line consists of a match expression followed by a tab and then one or more
+    tab-separated integers, which are mapped to category names using `category_mapping`.
+    """
+    for line in lines:
+        line = line.strip()
+        parts = line.split("\t")
+        yield parts[0], [category_mapping[category_id] for category_id in parts[1:]]
+
+
 def read_dic(filepath):
     """
     Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
     (lexicon, category_names), where:
     * `lexicon` is a dict mapping string patterns to lists of category names
-    * `categories` is a list of category names (as strings)
-
+    * `category_names` is a list of category names (as strings)
     """
-    # category_mapping is a mapping from integer string to category name
-    category_mapping = {}
-    # category_names is equivalent to category_mapping.values() but retains original ordering
-    category_names = []
-    lexicon = {}
-    # the mode is incremented by each '%' line in the file
-    mode = 0
-    for line in open(filepath):
-        tsv = line.strip()
-        if tsv:
-            parts = tsv.split()
-            if parts[0] == "%":
-                mode += 1
-            elif mode == 1:
-                # definining categories
-                category_names.append(parts[1])
-                category_mapping[parts[0]] = parts[1]
-            elif mode == 2:
-                lexicon[parts[0]] = [
-                    category_mapping[category_id] for category_id in parts[1:]
-                ]
-    return lexicon, category_names
+    with open(filepath) as lines:
+        # skip past the first "%" line (should be the very first line of the file)
+        for line in lines:
+            if line.strip() == "%":
+                break
+        # read categories (a mapping from integer string to category name)
+        category_mapping = dict(_parse_categories(lines))
+        # read lexicon (a mapping from matching string to a list of category names)
+        lexicon = dict(_parse_lexicon(lines, category_mapping))
+    return lexicon, list(category_mapping.values())