From aee71f136719ab917e3b767c6ef937e8a351767c Mon Sep 17 00:00:00 2001
From: Andy C <andy@oilshell.org>
Date: Sat, 18 Jan 2025 12:15:47 -0500
Subject: [PATCH] [htm8 refactor] Move old TagLexer API out

I tried to refactor this, but the old code is crufty and heavily
stateful.  Gah.

I think we need efficient CSS-style selectors for it.  There is a bunch
of copy and paste across these files:

    doctools/oils_doc.py
    doctools/help_gen.py
---
 build/doc.sh           |  28 ++++-
 data_lang/htm8.py      | 240 ++---------------------------------------
 data_lang/htm8_test.py |  28 -----
 doctools/help_gen.py   |   4 +-
 doctools/oils_doc.py   |   8 +-
 lazylex/html.py        | 224 +++++++++++++++++++++++++++++++++++++-
 lazylex/html_test.py   |  29 +++++
 7 files changed, 290 insertions(+), 271 deletions(-)
diff --git a/build/doc.sh b/build/doc.sh
index 32177f837..03f16ea2f 100755
--- a/build/doc.sh
+++ b/build/doc.sh
@@ -173,7 +173,7 @@ readonly MARKDOWN_DOCS=(
 # A better fix would be to implement json_utf8.load(f), which doesn't decode
 # into unicode instances.  This would remove useless conversions.
 
-readonly TIMESTAMP=$(date --rfc-email)
+DOC_TIMESTAMP=${DOC_TIMESTAMP:-$(date --rfc-email)}
 
 split-and-render() {
   local src=${1:-doc/known-differences.md}
@@ -194,7 +194,7 @@ split-and-render() {
   local css_files="$web_url/base.css $web_url/manual.css $web_url/toc.css $web_url/language.css $web_url/code.css"
 
   PYTHONPATH='.:vendor' doctools/split_doc.py \
-    -v build_timestamp="$TIMESTAMP" \
+    -v build_timestamp="$DOC_TIMESTAMP" \
     -v oil_version="$OIL_VERSION" \
     -v css_files="$css_files" \
     -v all_docs_url='.' \
@@ -268,7 +268,7 @@ render-only() {
   "css_files": "$css_files",
   "all_docs_url": ".",
 
-  "build_timestamp": "$TIMESTAMP",
+  "build_timestamp": "$DOC_TIMESTAMP",
   "oil_version": "$OIL_VERSION"
 }
 EOF
@@ -750,5 +750,27 @@ soil-run() {
   run-for-release
 }
 
+#
+# Golden tests
+#
+# $0 golden-tree
+# $0 deterministic  # with new code
+# $0 compare-golden
+
+deterministic() {
+  # build without varying timestamp
+  DOC_TIMESTAMP='GOLD' $0 soil-run
+}
+
+golden-tree() {
+  rm -r -f _release/VERSION/ _release/VERSION_gold/
+  deterministic
+  cp -r _release/VERSION/ _release/VERSION_gold
+}
+
+compare-golden() {
+  diff -r -u _release/VERSION_gold _release/VERSION/ 
+}
+
 "$@"
 
diff --git a/data_lang/htm8.py b/data_lang/htm8.py
index 3e6db5838..b64e69c8d 100644
--- a/data_lang/htm8.py
+++ b/data_lang/htm8.py
@@ -42,10 +42,9 @@
 
 import re
 
-from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
+from typing import Dict, List, Tuple, Optional, IO, Any
 
-from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
-                                     h8_tag_id_str, attr_name, attr_name_t,
+from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, attr_name, attr_name_t,
                                      attr_name_str, attr_value_e, attr_value_t,
                                      h8_val_id)
 from doctools.util import log
@@ -162,14 +161,15 @@ def MakeLexer(rules):
 # Lexers
 #
 
-_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter
+_NAME_RE = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter
 
 CHAR_LEX = [
     # Characters
     # https://www.w3.org/TR/xml/#sec-references
     (r'&\# [0-9]+ ;', h8_id.DecChar),
     (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
-    (r'& %s ;' % _NAME, h8_id.CharEntity),
+    # TODO: shouldn't use _NAME_RE?  Just letters
+    (r'& %s ;' % _NAME_RE, h8_id.CharEntity),
     # Allow unquoted, and quoted
     (r'&', h8_id.BadAmpersand),
 ]
@@ -203,11 +203,11 @@ def MakeLexer(rules):
     # - We look for a valid tag name, but we don't validate attributes.
     #   That's done in the tag lexer.
     # - We don't allow leading whitespace
-    (r'</ (%s) >' % _NAME, h8_id.EndTag),
+    (r'</ (%s) >' % _NAME_RE, h8_id.EndTag),
     # self-closing <br/>  comes before StartTag
     # could/should these be collapsed into one rule?
-    (r'<  (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # end </a>
-    (r'<  (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>
+    (r'<  (%s) [^>\x00]* />' % _NAME_RE, h8_id.StartEndTag),  # end </a>
+    (r'<  (%s) [^>\x00]* >' % _NAME_RE, h8_id.StartTag),  # start <a>
 
     # HTML5 allows unescaped > in raw data, but < is not allowed.
     # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
@@ -395,7 +395,7 @@ def LookAhead(self, regex):
     #
     # If the = is not present, then we set the lexer in a state for
     # attr_value_e.Missing.
-    (r'\s+ (%s) \s* (=)? \s*' % _NAME, attr_name.Ok),
+    (r'\s+ (%s) \s* (=)? \s*' % _NAME_RE, attr_name.Ok),
     # unexpected EOF
 
     # The closing > or /> is treated as end of stream, and it's not an error.
@@ -420,8 +420,6 @@ def LookAhead(self, regex):
 # it's not common.  It opens up the j"" and $"" extensions
 # <a href = what'foo' >        # ditto
 
-# TODO: get rid of OLD copy
-_UNQUOTED_VALUE_OLD = r'''[^ \t\r\n<>&"'\x00]*'''
 _UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]+'''
 
 # What comes after = ?
@@ -758,223 +756,3 @@ def AllAttrsRaw(attr_lx):
         v = s[val_start:val_end]
         pairs.append((n, v))
     return pairs
-
-
-#
-# OLD API - REMOVE THIS
-#
-
-# Tag names:
-#   Match <a  or </a
-#   Match <h2, but not <2h
-#
-# HTML 5 doesn't restrict tag names at all
-#   https://html.spec.whatwg.org/#toc-syntax
-#
-# XML allows : - .
-#  https://www.w3.org/TR/xml/#NT-NameChar
-
-# Namespaces for MathML, SVG
-# XLink, XML, XMLNS
-#
-# https://infra.spec.whatwg.org/#namespaces
-#
-# Allow - for td-attrs
-
-# TODO: we don't need to capture the tag name here?  That's done at the top
-# level
-_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)
-
-_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)
-
-# To match href="foo"
-# Note: in HTML5 and XML, single quoted attributes are also valid
-
-# <button disabled> is standard usage
-
-# NOTE: This used to allow whitespace around =
-# <a foo = "bar">  makes sense in XML
-# But then you also have
-# <a foo= bar> - which is TWO attributes, in HTML5
-# So the space is problematic
-
-_ATTR_RE = re.compile(
-    r'''
-\s+                     # Leading whitespace is required
-(%s)                    # Attribute name
-(?:                     # Optional attribute value
-  \s* = \s*             # Spaces allowed around =
-  (?:
-    " ([^>"\x00]*) "    # double quoted value
-  | ' ([^>'\x00]*) '    # single quoted value
-  | (%s)                # Attribute value
-  )
-)?             
-''' % (_NAME, _UNQUOTED_VALUE_OLD), re.VERBOSE)
-
-
-class TagLexer(object):
-    """
-    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
-    provides a few operations:
-
-    - What is the tag?
-    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
-    """
-
-    def __init__(self, s):
-        # type: (str) -> None
-        self.s = s
-        self.start_pos = -1  # Invalid
-        self.end_pos = -1
-
-    def Reset(self, start_pos, end_pos):
-        # type: (int, int) -> None
-        """Reuse instances of this object."""
-        assert start_pos >= 0, start_pos
-        assert end_pos >= 0, end_pos
-
-        self.start_pos = start_pos
-        self.end_pos = end_pos
-
-    def WholeTagString(self):
-        # type: () -> str
-        """Return the entire tag string, e.g. <a href='foo'>"""
-        return self.s[self.start_pos:self.end_pos]
-
-    def GetTagName(self):
-        # type: () -> str
-        # First event
-        tok_id, start, end = next(self.Tokens())
-        return self.s[start:end]
-
-    def GetSpanForAttrValue(self, attr_name):
-        # type: (str) -> Tuple[int, int]
-        """
-        Used by oils_doc.py, for href shortcuts
-        """
-        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
-        # TODO: Could also cache these
-
-        events = self.Tokens()
-        val = (-1, -1)
-        try:
-            while True:
-                tok_id, start, end = next(events)
-                if tok_id == h8_tag_id.AttrName:
-                    name = self.s[start:end]
-                    if name == attr_name:
-                        # The value should come next
-                        tok_id, start, end = next(events)
-                        assert tok_id in (
-                            h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
-                            h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
-                        val = start, end
-                        break
-
-        except StopIteration:
-            pass
-        return val
-
-    def GetAttrRaw(self, attr_name):
-        # type: (str) -> Optional[str]
-        """
-        Return the value, which may be UNESCAPED.
-        """
-        start, end = self.GetSpanForAttrValue(attr_name)
-        if start == -1:
-            return None
-        return self.s[start:end]
-
-    def AllAttrsRawSlice(self):
-        # type: () -> List[Tuple[str, int, int]]
-        """
-        Get a list of pairs [('class', 3, 5), ('href', 9, 12)]
-        """
-        slices = []
-        events = self.Tokens()
-        try:
-            while True:
-                tok_id, start, end = next(events)
-                if tok_id == h8_tag_id.AttrName:
-                    name = self.s[start:end]
-
-                    # The value should come next
-                    tok_id, start, end = next(events)
-                    assert tok_id in (
-                        h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
-                        h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
-                    # Note: quoted values may have &amp;
-                    # We would need ANOTHER lexer to unescape them, but we
-                    # don't need that for ul-table
-                    slices.append((name, start, end))
-        except StopIteration:
-            pass
-        return slices
-
-    def AllAttrsRaw(self):
-        # type: () -> List[Tuple[str, str]]
-        """
-        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]
-
-        The quoted values may be escaped.  We would need another lexer to
-        unescape them.
-        """
-        slices = self.AllAttrsRawSlice()
-        pairs = []
-        for name, start, end in slices:
-            pairs.append((name, self.s[start:end]))
-        return pairs
-
-    def Tokens(self):
-        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
-        """
-        Yields a sequence of tokens: Tag (AttrName AttrValue?)*
-
-        Where each Token is (Type, start_pos, end_pos)
-
-        Note that start and end are NOT redundant!  We skip over some unwanted
-        characters.
-        """
-        m = _TAG_RE.match(self.s, self.start_pos + 1)
-        if not m:
-            raise RuntimeError("Couldn't find HTML tag in %r" %
-                               self.WholeTagString())
-        yield h8_tag_id.TagName, m.start(1), m.end(1)
-
-        pos = m.end(0)
-        #log('POS %d', pos)
-
-        while True:
-            # don't search past the end
-            m = _ATTR_RE.match(self.s, pos, self.end_pos)
-            if not m:
-                #log('BREAK pos %d', pos)
-                break
-            #log('AttrName %r', m.group(1))
-
-            yield h8_tag_id.AttrName, m.start(1), m.end(1)
-
-            #log('m.groups() %r', m.groups())
-            if m.group(2) is not None:
-                # double quoted
-                yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
-            elif m.group(3) is not None:
-                # single quoted - TODO: could have different token types
-                yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
-            elif m.group(4) is not None:
-                yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
-            else:
-                # <button disabled>
-                end = m.end(0)
-                yield h8_tag_id.MissingValue, end, end
-
-            # Skip past the "
-            pos = m.end(0)
-
-        #log('TOK %r', self.s)
-
-        m = _TAG_LAST_RE.match(self.s, pos)
-        #log('_TAG_LAST_RE match %r', self.s[pos:])
-        if not m:
-            raise LexError('Extra data at end of tag', self.s, pos)
diff --git a/data_lang/htm8_test.py b/data_lang/htm8_test.py
index 5fb7cbdba..25f58411f 100755
--- a/data_lang/htm8_test.py
+++ b/data_lang/htm8_test.py
@@ -17,34 +17,6 @@
     TEST_HTML = f.read()
 
 
-class RegexTest(unittest.TestCase):
-
-    def testDotAll(self):
-        # type: () -> None
-
-        # Note that $ matches end of line, not end of string
-        p1 = re.compile(r'.')
-        print(p1.match('\n'))
-
-        p2 = re.compile(r'.', re.DOTALL)
-        print(p2.match('\n'))
-
-        #p3 = re.compile(r'[.\n]', re.VERBOSE)
-        p3 = re.compile(r'[.\n]')
-        print(p3.match('\n'))
-
-        print('Negation')
-
-        p4 = re.compile(r'[^>]')
-        print(p4.match('\n'))
-
-    def testAttrRe(self):
-        # type: () -> None
-        _ATTR_RE = htm8._ATTR_RE
-        m = _ATTR_RE.match(' empty= val')
-        print(m.groups())
-
-
 class FunctionsTest(unittest.TestCase):
 
     def testFindLineNum(self):
diff --git a/doctools/help_gen.py b/doctools/help_gen.py
index 3eed53013..3153cb87b 100755
--- a/doctools/help_gen.py
+++ b/doctools/help_gen.py
@@ -308,7 +308,7 @@ def ExtractBody(s):
     """
     f = cStringIO.StringIO()
     out = htm8.Output(s, f)
-    tag_lexer = htm8.TagLexer(s)
+    tag_lexer = html.TagLexer(s)
 
     pos = 0
     it = html.ValidTokens(s)
@@ -364,7 +364,7 @@ def HelpTopics(s):
 
     yield groups (section_id, section_name, block of text)
     """
-    tag_lexer = htm8.TagLexer(s)
+    tag_lexer = html.TagLexer(s)
 
     pos = 0
     it = html.ValidTokens(s)
diff --git a/doctools/oils_doc.py b/doctools/oils_doc.py
index a698bead9..76a9262a4 100755
--- a/doctools/oils_doc.py
+++ b/doctools/oils_doc.py
@@ -112,7 +112,7 @@ def ExpandLinks(s):
     f = StringIO()
     out = htm8.Output(s, f)
 
-    tag_lexer = htm8.TagLexer(s)
+    tag_lexer = html.TagLexer(s)
 
     pos = 0
 
@@ -338,7 +338,7 @@ def SimpleHighlightCode(s):
     f = StringIO()
     out = htm8.Output(s, f)
 
-    tag_lexer = htm8.TagLexer(s)
+    tag_lexer = html.TagLexer(s)
 
     pos = 0
 
@@ -398,7 +398,7 @@ def HighlightCode(s, default_highlighter, debug_out=None):
     f = StringIO()
     out = htm8.Output(s, f)
 
-    tag_lexer = htm8.TagLexer(s)
+    tag_lexer = html.TagLexer(s)
 
     pos = 0
 
@@ -556,7 +556,7 @@ def ExtractCode(s, f):
     2. Decode &amp; -> &,e tc. and return it
     """
     out = htm8.Output(s, f)
-    tag_lexer = htm8.TagLexer(s)
+    tag_lexer = html.TagLexer(s)
 
     block_num = 0
     pos = 0
diff --git a/lazylex/html.py b/lazylex/html.py
index 9826dbb0b..0e1fc4798 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -7,9 +7,12 @@
 """
 from __future__ import print_function
 
-from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str)
+import re
+
+from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, h8_tag_id,
+                                     h8_tag_id_t, h8_tag_id_str)
 from data_lang import htm8
-from data_lang.htm8 import (Lexer, TagLexer, LexError, ParseError, Output)
+from data_lang.htm8 import (Lexer, LexError, ParseError, Output, _NAME_RE)
 from doctools.util import log
 
 try:
@@ -20,7 +23,7 @@
 import sys
 
 if sys.version_info.major == 2:
-    from typing import List, Tuple, Iterator
+    from typing import List, Tuple, Iterator, Optional
 
 
 def _Tokens(s, left_pos, right_pos):
@@ -343,6 +346,221 @@ def __init__(self):
         #self.debug_attrs = []
 
 
+#
+# OLD TagLexer API - REMOVE THIS
+#
+# HTML 5 doesn't restrict tag names at all
+#   https://html.spec.whatwg.org/#toc-syntax
+#
+# XML allows : - .
+#  https://www.w3.org/TR/xml/#NT-NameChar
+
+# Namespaces for MathML, SVG
+# XLink, XML, XMLNS
+#
+# https://infra.spec.whatwg.org/#namespaces
+#
+# Allow - for td-attrs
+
+# Like _UNQUOTED_VALUE in data_lang/htm8.py, but * (not +): may match empty
+_UNQUOTED_VALUE_OLD = r'''[^ \t\r\n<>&"'\x00]*'''
+
+_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME_RE, re.VERBOSE)
+
+_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)
+
+# To match href="foo"
+
+# <button disabled> is standard usage
+
+# NOTE: This still allows whitespace around = (see _ATTR_RE below)
+# <a foo = "bar">  makes sense in XML
+# But then you also have
+# <a foo= bar> - which is TWO attributes, in HTML5
+# So the space is problematic
+
+_ATTR_RE = re.compile(
+    r'''
+\s+                     # Leading whitespace is required
+(%s)                    # Attribute name
+(?:                     # Optional attribute value
+  \s* = \s*             # Spaces allowed around =
+  (?:
+    " ([^>"\x00]*) "    # double quoted value
+  | ' ([^>'\x00]*) '    # single quoted value
+  | (%s)                # Attribute value
+  )
+)?             
+''' % (_NAME_RE, _UNQUOTED_VALUE_OLD), re.VERBOSE)
+
+
+class TagLexer(object):
+    """
+    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
+    provides a few operations:
+
+    - What is the tag?
+    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
+    """
+
+    def __init__(self, s):
+        # type: (str) -> None
+        self.s = s
+        self.start_pos = -1  # Invalid
+        self.end_pos = -1
+
+    def Reset(self, start_pos, end_pos):
+        # type: (int, int) -> None
+        """Reuse instances of this object."""
+        assert start_pos >= 0, start_pos
+        assert end_pos >= 0, end_pos
+
+        self.start_pos = start_pos
+        self.end_pos = end_pos
+
+    def WholeTagString(self):
+        # type: () -> str
+        """Return the entire tag string, e.g. <a href='foo'>"""
+        return self.s[self.start_pos:self.end_pos]
+
+    def GetTagName(self):
+        # type: () -> str
+        # First event
+        tok_id, start, end = next(self.Tokens())
+        return self.s[start:end]
+
+    def GetSpanForAttrValue(self, attr_name):
+        # type: (str) -> Tuple[int, int]
+        """
+        Used by oils_doc.py, for href shortcuts
+        """
+        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
+        # TODO: Could also cache these
+
+        events = self.Tokens()
+        val = (-1, -1)
+        try:
+            while True:
+                tok_id, start, end = next(events)
+                if tok_id == h8_tag_id.AttrName:
+                    name = self.s[start:end]
+                    if name == attr_name:
+                        # The value should come next
+                        tok_id, start, end = next(events)
+                        assert tok_id in (
+                            h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
+                            h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
+                        val = start, end
+                        break
+
+        except StopIteration:
+            pass
+        return val
+
+    def GetAttrRaw(self, attr_name):
+        # type: (str) -> Optional[str]
+        """
+        Return the value, which may be UNESCAPED.
+        """
+        start, end = self.GetSpanForAttrValue(attr_name)
+        if start == -1:
+            return None
+        return self.s[start:end]
+
+    def AllAttrsRawSlice(self):
+        # type: () -> List[Tuple[str, int, int]]
+        """
+        Get a list of pairs [('class', 3, 5), ('href', 9, 12)]
+        """
+        slices = []
+        events = self.Tokens()
+        try:
+            while True:
+                tok_id, start, end = next(events)
+                if tok_id == h8_tag_id.AttrName:
+                    name = self.s[start:end]
+
+                    # The value should come next
+                    tok_id, start, end = next(events)
+                    assert tok_id in (
+                        h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
+                        h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
+                    # Note: quoted values may have &amp;
+                    # We would need ANOTHER lexer to unescape them, but we
+                    # don't need that for ul-table
+                    slices.append((name, start, end))
+        except StopIteration:
+            pass
+        return slices
+
+    def AllAttrsRaw(self):
+        # type: () -> List[Tuple[str, str]]
+        """
+        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]
+
+        The quoted values may be escaped.  We would need another lexer to
+        unescape them.
+        """
+        slices = self.AllAttrsRawSlice()
+        pairs = []
+        for name, start, end in slices:
+            pairs.append((name, self.s[start:end]))
+        return pairs
+
+    def Tokens(self):
+        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
+        """
+        Yields a sequence of tokens: Tag (AttrName AttrValue?)*
+
+        Where each Token is (Type, start_pos, end_pos)
+
+        Note that start and end are NOT redundant!  We skip over some unwanted
+        characters.
+        """
+        m = _TAG_RE.match(self.s, self.start_pos + 1)
+        if not m:
+            raise RuntimeError("Couldn't find HTML tag in %r" %
+                               self.WholeTagString())
+        yield h8_tag_id.TagName, m.start(1), m.end(1)
+
+        pos = m.end(0)
+        #log('POS %d', pos)
+
+        while True:
+            # don't search past the end
+            m = _ATTR_RE.match(self.s, pos, self.end_pos)
+            if not m:
+                #log('BREAK pos %d', pos)
+                break
+            #log('AttrName %r', m.group(1))
+
+            yield h8_tag_id.AttrName, m.start(1), m.end(1)
+
+            #log('m.groups() %r', m.groups())
+            if m.group(2) is not None:
+                # double quoted
+                yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
+            elif m.group(3) is not None:
+                # single quoted - TODO: could have different token types
+                yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
+            elif m.group(4) is not None:
+                yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
+            else:
+                # <button disabled>
+                end = m.end(0)
+                yield h8_tag_id.MissingValue, end, end
+
+            # Resume after the whole attribute (incl. any closing quote)
+            pos = m.end(0)
+
+        #log('TOK %r', self.s)
+
+        m = _TAG_LAST_RE.match(self.s, pos)
+        #log('_TAG_LAST_RE match %r', self.s[pos:])
+        if not m:
+            raise LexError('Extra data at end of tag', self.s, pos)
+
+
 def main(argv):
     # type: (List[str]) -> int
     action = argv[1]
diff --git a/lazylex/html_test.py b/lazylex/html_test.py
index e4e7e17b6..06e01096d 100755
--- a/lazylex/html_test.py
+++ b/lazylex/html_test.py
@@ -1,12 +1,41 @@
 #!/usr/bin/env python2
 from __future__ import print_function
 
+import re
 import unittest
 
 from lazylex import html  # module under test log = html.log
 from doctools.util import log
 
 
+class RegexTest(unittest.TestCase):
+
+    def testDotAll(self):
+        # type: () -> None
+
+        # Note that $ matches end of line, not end of string
+        p1 = re.compile(r'.')
+        print(p1.match('\n'))
+
+        p2 = re.compile(r'.', re.DOTALL)
+        print(p2.match('\n'))
+
+        #p3 = re.compile(r'[.\n]', re.VERBOSE)
+        p3 = re.compile(r'[.\n]')
+        print(p3.match('\n'))
+
+        print('Negation')
+
+        p4 = re.compile(r'[^>]')
+        print(p4.match('\n'))
+
+    def testAttrRe(self):
+        # type: () -> None
+        _ATTR_RE = html._ATTR_RE
+        m = _ATTR_RE.match(' empty= val')
+        print(m.groups())
+
+
 class FunctionsTest(unittest.TestCase):
 
     def testToText(self):