[htm8 refactor] Cleanup, test refactoring, API design

oils-for-unix · Jan 16, 2025 · f167146 · f167146
1 parent def622c
commit f167146
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 103 deletions.
diff --git a/data_lang/htm8.py b/data_lang/htm8.py
@@ -464,7 +464,8 @@ def LookAhead(self, regex):
 #
 # For now, I guess we live with <a href=?foo/>
 
-A_VALUE_LEX = CHAR_LEX + [
+# What comes after = ?
+A_VALUE_LEX = [
     (r'"', h8_val_id.DoubleQuote),
     (r"'", h8_val_id.SingleQuote),
     (_UNQUOTED_VALUE, h8_val_id.UnquotedVal),
@@ -478,6 +479,18 @@ def LookAhead(self, regex):
 
 A_VALUE_LEX_COMPILED = MakeLexer(A_VALUE_LEX)
 
+# What's inside "" or '' ?
+QUOTED_VALUE_LEX = CHAR_LEX + [
+    (r'"', h8_id.DoubleQuote),
+    (r"'", h8_id.SingleQuote),
+    (r'<', h8_id.BadLessThan),  # BadAmpersand is in CharLex
+    (r'''[^"'<>&\x00]+''', h8_id.RawData),
+    # This includes > - it is not BadGreaterThan because it's NOT recoverable
+    (r'.', h8_id.Invalid),
+]
+
+QUOTED_VALUE_LEX_COMPILED = MakeLexer(QUOTED_VALUE_LEX)
+
 
 class AttrLexer(object):
     """
@@ -490,7 +503,7 @@ class AttrLexer(object):
               print('div')
 
             # TODO: also pass Optional[List[]] out_tokens?
-            v, start_pos, end_pos = attr_lx.ReadRawValue()
+            v, start_pos, end_pos = attr_lx.ReadValue()
     """
 
     def __init__(self, s):
@@ -582,21 +595,21 @@ def AttrNameEquals(self, expected):
         return expected == self._CanonicalAttrName()
 
     def _QuotedRead(self):
-        # type: () -> Tuple[h8_id, end_pos]
+        # type: () -> Tuple[h8_id_t, int]
 
         for pat, tok_id in QUOTED_VALUE_LEX_COMPILED:
             m = pat.match(self.s, self.pos)
             if m:
                 end_pos = m.end(0)  # Advance
-                log('_QuotedRead %r', self.s[self.pos:end_pos])
+                #log('_QuotedRead %r', self.s[self.pos:end_pos])
                 return tok_id, end_pos
         else:
             context = self.s[self.pos:self.pos + 10]
             raise AssertionError('h8_id.Invalid rule should have matched %r' %
                                  context)
 
-    def ReadRawValue(self):
-        # type: () -> Tuple[attr_value_t, int, int]
+    def ReadValue(self, tokens_out=None):
+        # type: (Optional[List[Tuple[h8_id_t, int]]]) -> Tuple[attr_value_t, int, int]
         """Read the attribute value.
 
         In general, it is escaped or "raw"
@@ -621,9 +634,13 @@ def ReadRawValue(self):
                 self.pos = m.end(0)  # Advance
 
                 #log('m %s', m.groups())
+
+                # Note: Unquoted value can't contain &amp; etc. now, so there
+                # is no unquoting, and tokens_out is not populated for it.
                 if a == h8_val_id.UnquotedVal:
                     return attr_value_e.Unquoted, m.start(0), m.end(0)
 
+                # TODO: respect tokens_out
                 if a == h8_val_id.DoubleQuote:
                     left_inner = self.pos
                     while True:
@@ -634,8 +651,8 @@ def ReadRawValue(self):
                             return attr_value_e.DoubleQuoted, left_inner, self.pos
                         self.pos = q_end_pos  # advance
 
+                # TODO: respect tokens_out
                 if a == h8_val_id.SingleQuote:
-
                     left_inner = self.pos
                     while True:
                         tok_id, q_end_pos = self._QuotedRead()
@@ -651,18 +668,10 @@ def ReadRawValue(self):
         else:
             raise AssertionError('h8_val_id.NoMatch rule should have matched')
 
-    def SkipValue(self):
-        # type: () -> None
-        # Just ignore it and return
-        self.ReadRawValue()
-
-    def ReadValueAndDecode(self):
-        # type: () -> str
-        """Read the attribute vlaue
-        """
-        # TODO: tokenize it
-        pass
 
+#
+# OLD API - REMOVE THIS
+#
 
 # Tag names:
 #   Match <a  or </a
@@ -894,17 +903,6 @@ def Tokens(self):
 
 ATTR_VALUE_LEX_COMPILED = MakeLexer(ATTR_VALUE_LEX)
 
-QUOTED_VALUE_LEX = CHAR_LEX + [
-    (r'"', h8_id.DoubleQuote),
-    (r"'", h8_id.SingleQuote),
-    (r'<', h8_id.BadLessThan),  # BadAmpersand is in CharLex
-    (r'''[^"'<>&\x00]+''', h8_id.RawData),
-    # This includes > - it is not BadGreaterThan because it's NOT recoverable
-    (r'.', h8_id.Invalid),
-]
-
-QUOTED_VALUE_LEX_COMPILED = MakeLexer(QUOTED_VALUE_LEX)
-
 
 class AttrValueLexer(object):
     """

diff --git a/data_lang/htm8_test.py b/data_lang/htm8_test.py
@@ -7,7 +7,7 @@
 import unittest
 import re
 
-from typing import List, Tuple
+from typing import List, Tuple, Any
 
 from data_lang import htm8
 from doctools.util import log
@@ -54,6 +54,20 @@ def testFindLineNum(self):
             print(line_num)
 
 
+def _MakeAttrLexer(t, h, expected_tag=h8_id.StartTag):
+    # type: (Any, str, Any) -> htm8.AttrLexer
+
+    lx = htm8.Lexer(h)
+
+    tok_id, end_pos = lx.Read()
+    t.assertEqual(expected_tag, tok_id)
+
+    attr_lx = htm8.AttrLexer(h)
+    attr_lx.Init(lx.TagNamePos(), end_pos)
+
+    return attr_lx
+
+
 class AttrLexerTest(unittest.TestCase):
 
     def testNoAttrs(self):
@@ -82,43 +96,31 @@ def testNoAttrs(self):
         self.assertEqual(-1, name_end)
 
         try:
-            result = attr_lx.ReadRawValue()
+            result = attr_lx.ReadValue()
         except AssertionError as e:
             print(e)
         else:
             self.fail('should have failed')
 
     def testInvalid(self):
         h = '<a !>'
-        lx = htm8.Lexer(h)
-
-        tok_id, end_pos = lx.Read()
-        self.assertEqual(h8_id.StartTag, tok_id)
-
-        attr_lx = htm8.AttrLexer(h)
-        attr_lx.Init(lx.TagNamePos(), end_pos)
+        attr_lx = _MakeAttrLexer(self, h)
 
         n, name_start, name_end = attr_lx.ReadName()
         self.assertEqual(n, attr_name.Invalid)
         self.assertEqual(-1, name_start)
         self.assertEqual(-1, name_end)
 
         try:
-            result = attr_lx.ReadRawValue()
+            result = attr_lx.ReadValue()
         except AssertionError as e:
             print(e)
         else:
             self.fail('should have failed')
 
     def testEmpty(self):
         h = '<img src=/>'
-        lx = htm8.Lexer(h)
-
-        tok_id, end_pos = lx.Read()
-        self.assertEqual(h8_id.StartEndTag, tok_id)
-
-        attr_lx = htm8.AttrLexer(h)
-        attr_lx.Init(lx.TagNamePos(), end_pos)
+        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)
 
         n, name_start, name_end = attr_lx.ReadName()
         self.assertEqual(n, attr_name.Ok)
@@ -129,21 +131,15 @@ def testEmpty(self):
         self.assertEqual(True, attr_lx.AttrNameEquals('src'))
         self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))
 
-        v, attr_start, attr_end = attr_lx.ReadRawValue()
+        v, attr_start, attr_end = attr_lx.ReadValue()
         log('v = %s', attr_value_str(v))
         self.assertEqual(attr_value_e.Empty, v)
         self.assertEqual(-1, attr_start)
         self.assertEqual(-1, attr_end)
 
     def testMissing(self):
         h = '<img SRC/>'
-        lx = htm8.Lexer(h)
-
-        tok_id, end_pos = lx.Read()
-        self.assertEqual(h8_id.StartEndTag, tok_id)
-
-        attr_lx = htm8.AttrLexer(h)
-        attr_lx.Init(lx.TagNamePos(), end_pos)
+        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)
 
         n, name_start, name_end = attr_lx.ReadName()
         self.assertEqual(n, attr_name.Ok)
@@ -154,27 +150,22 @@ def testMissing(self):
         self.assertEqual(True, attr_lx.AttrNameEquals('src'))
         self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))
 
-        v, attr_start, attr_end = attr_lx.ReadRawValue()
+        v, attr_start, attr_end = attr_lx.ReadValue()
         self.assertEqual(attr_value_e.Missing, v)
         self.assertEqual(-1, attr_start)
         self.assertEqual(-1, attr_end)
 
     def testUnquoted(self):
         # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
         h = '<a x=foo/>'
-        lx = htm8.Lexer(h)
-
-        tok_id, end_pos = lx.Read()
-        self.assertEqual(h8_id.StartEndTag, tok_id)
+        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)
 
-        attr_lx = htm8.AttrLexer(h)
-        attr_lx.Init(lx.TagNamePos(), end_pos)
         n, name_start, name_end = attr_lx.ReadName()
         self.assertEqual(n, attr_name.Ok)
         self.assertEqual(3, name_start)
         self.assertEqual(4, name_end)
 
-        v, attr_start, attr_end = attr_lx.ReadRawValue()
+        v, attr_start, attr_end = attr_lx.ReadValue()
 
         log('v = %s', attr_value_str(v))
         log('unquoted val %r', h[attr_start:attr_end])
@@ -185,19 +176,14 @@ def testUnquoted(self):
 
     def testDoubleQuoted(self):
         h = '<a x="f&">'
-        lx = htm8.Lexer(h)
-
-        tok_id, end_pos = lx.Read()
-        self.assertEqual(h8_id.StartTag, tok_id)
+        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)
 
-        attr_lx = htm8.AttrLexer(h)
-        attr_lx.Init(lx.TagNamePos(), end_pos)
         n, name_start, name_end = attr_lx.ReadName()
         self.assertEqual(n, attr_name.Ok)
         self.assertEqual(3, name_start)
         self.assertEqual(4, name_end)
 
-        v, attr_start, attr_end = attr_lx.ReadRawValue()
+        v, attr_start, attr_end = attr_lx.ReadValue()
 
         log('v = %s', attr_value_str(v))
         log('val %r', h[attr_start:attr_end])
@@ -208,19 +194,14 @@ def testDoubleQuoted(self):
 
     def testSingleQuoted(self):
         h = "<a x='&f'>"
-        lx = htm8.Lexer(h)
+        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)
 
-        tok_id, end_pos = lx.Read()
-        self.assertEqual(h8_id.StartTag, tok_id)
-
-        attr_lx = htm8.AttrLexer(h)
-        attr_lx.Init(lx.TagNamePos(), end_pos)
         n, name_start, name_end = attr_lx.ReadName()
         self.assertEqual(n, attr_name.Ok)
         self.assertEqual(3, name_start)
         self.assertEqual(4, name_end)
 
-        v, attr_start, attr_end = attr_lx.ReadRawValue()
+        v, attr_start, attr_end = attr_lx.ReadValue()
 
         log('v = %s', attr_value_str(v))
         log('unquoted val %r', h[attr_start:attr_end])
@@ -231,41 +212,31 @@ def testSingleQuoted(self):
 
     def testDoubleQuoted_Bad(self):
         h = '<a x="foo>'
-        lx = htm8.Lexer(h)
+        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)
 
-        tok_id, end_pos = lx.Read()
-        self.assertEqual(h8_id.StartTag, tok_id)
-
-        attr_lx = htm8.AttrLexer(h)
-        attr_lx.Init(lx.TagNamePos(), end_pos)
         n, name_start, name_end = attr_lx.ReadName()
         self.assertEqual(n, attr_name.Ok)
         self.assertEqual(3, name_start)
         self.assertEqual(4, name_end)
 
         try:
-            v, attr_start, attr_end = attr_lx.ReadRawValue()
+            v, attr_start, attr_end = attr_lx.ReadValue()
         except htm8.LexError as e:
             print(e)
         else:
             self.fail('Expected LexError')
 
     def testSingleQuoted_Bad(self):
         h = "<a x='foo>"
-        lx = htm8.Lexer(h)
-
-        tok_id, end_pos = lx.Read()
-        self.assertEqual(h8_id.StartTag, tok_id)
+        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)
 
-        attr_lx = htm8.AttrLexer(h)
-        attr_lx.Init(lx.TagNamePos(), end_pos)
         n, name_start, name_end = attr_lx.ReadName()
         self.assertEqual(n, attr_name.Ok)
         self.assertEqual(3, name_start)
         self.assertEqual(4, name_end)
 
         try:
-            v, attr_start, attr_end = attr_lx.ReadRawValue()
+            v, attr_start, attr_end = attr_lx.ReadValue()
         except htm8.LexError as e:
             print(e)
         else: