From 07ebeba7ea6a6b7ce866c5befbb0e40cc05fde10 Mon Sep 17 00:00:00 2001
From: Andy C
Date: Sun, 12 Jan 2025 10:44:53 -0500
Subject: [PATCH] [lazylex/html] Allow unescaped &

This occurs in real data, and is similar to allowing < and > (which we
don't do everywhere yet).

The XML philosophy lost :)
---
 data_lang/htm8-test.sh | 11 +----------
 doc/htm8.md            | 15 +++++++++++++++
 lazylex/html.py        | 11 +++++++++--
 lazylex/html_test.py   | 30 +++++++++++++++++++++++++-----
 4 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/data_lang/htm8-test.sh b/data_lang/htm8-test.sh
index b227ed689..377ea8fe9 100755
--- a/data_lang/htm8-test.sh
+++ b/data_lang/htm8-test.sh
@@ -26,17 +26,8 @@
 # - I may also want to do this with JSON
 #
 # Not working yet:
-# - understanding all entities &zz;
-#   - there are over 2000 of them, not sure I want to build them all
-#     into the Oils binaries
 # - capital letters - I guess we can normalize the case
-#
-# Leniency:
-# - foo=1&bar=2 is extremely common
-# - well then does that mean you allow <pre>a & b</pre> - the quotes help
-# - I guess you can have a rule for unescaped &, just like unescaped backslash
-#   - you can warn about it, but it doesn't cause much problem?
-# We are already firmly in HTML territory, not in XML ...
+#   - islower()
 #
 # Features:
 # - Are there special rules for <script> and <style>?
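The "over 2000 of them" estimate in the deleted comment is easy to confirm: Python's stdlib ships the full HTML5 named-entity table. A quick check, using only the stdlib (not part of this patch):

```python
# Sizing the HTML5 named-entity table that the deleted comment refers to.
from html.entities import html5  # maps 'amp;' -> '&', etc.

print(len(html5))      # a bit over 2000, counting variants without ';'
print(html5['amp;'])   # '&'
print(html5['nbsp;'])  # '\xa0'
```

That size is why HTM8 lexes entity names generically (as CharEntity) rather than building the whole table into the Oils binaries.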
diff --git a/doc/htm8.md b/doc/htm8.md
index e2847ebc0..a21290104 100644
--- a/doc/htm8.md
+++ b/doc/htm8.md
@@ -133,6 +133,21 @@ Conflicts between HTML5 and XML:
 - Maybe validate any other declarations, like `<!DOCTYPE foo>`
 - Add XML header `<?xml version=...?>`, remove `<!DOCTYPE html>`
 
+## Leniency
+
+Angle brackets:
+
+- `<a foo="<">` is allowed, but `<a foo=">">` is disallowed
+- `<p>4>3</p>` is allowed, but `<p>4<3</p>` is disallowed
+
+This makes lexing the top-level structure easier.
+
+- unescaped `&` is allowed, unlike XML
+  - it's very common in `<a href="?foo=1&bar=2">`
+  - It's lexed as BadAmpersand, in case you want to fix it for XML.  Although
+    we don't do that for < and > consistently.
+
+
 ## Related
 
 - [ysh-doc-processing.html](ysh-doc-processing.html)
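To see the new leniency from the API side: a bare `&` now lexes as a BadAmpersand token rather than failing, and `ToText()` passes it through verbatim. A sketch, assuming the `lazylex.html` functions shown in this patch (`ValidTokens()`, `Tok`, `ToText()`), and that `ValidTokens()` takes explicit start/end positions as in the patch's `ToText()`:

```python
# Sketch against the patched lazylex.html API; token ids print as small ints.
from lazylex import html

s = '<p>x & y</p>'  # valid HTM8 now, though still invalid XML

pos = 0
for tok_id, end_pos in html.ValidTokens(s, 0, len(s)):
    print(tok_id, repr(s[pos:end_pos]))  # the lone '&' span is Tok.BadAmpersand
    pos = end_pos

# ToText() now treats BadAmpersand like RawData, so the '&' survives:
print(html.ToText(s))  # => 'x & y'
```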
diff --git a/lazylex/html.py b/lazylex/html.py
index 671328a24..66beffe31 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -113,7 +113,7 @@ def Print(self, s):
 # HTML Tokens
 
 # CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
-TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData Invalid EndOfStream'.split(
+TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand Invalid EndOfStream'.split(
 )
 
@@ -184,6 +184,8 @@ def MakeLexer(rules):
     (r'&\# [0-9]+ ;', Tok.DecChar),
     (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
     (r'& %s ;' % _NAME, Tok.CharEntity),
+    # Allow unquoted, and quoted
+    (r'&', Tok.BadAmpersand),
 ]
 
 LEXER = CHAR_LEX + [
@@ -741,6 +743,7 @@ def ReadUntilEndTag(it, tag_lexer, tag_name):
     'lt': '<',
     'gt': '>',
     'quot': '"',
+    'apos': "'",
 }
 
@@ -763,7 +766,7 @@ def ToText(s, left_pos=0, right_pos=-1):
 
     pos = left_pos
     for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
-        if tok_id == Tok.RawData:
+        if tok_id in (Tok.RawData, Tok.BadAmpersand):
             out.SkipTo(pos)
             out.PrintUntil(end_pos)
 
@@ -782,6 +785,10 @@ def ToText(s, left_pos=0, right_pos=-1):
         elif tok_id == Tok.DecChar:
             raise AssertionError('Dec Char %r' % s[pos:pos + 20])
 
+        else:
+            # Skip everything else
+            out.SkipTo(end_pos)
+
         pos = end_pos
 
     out.PrintTheRest()
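The one-character `(r'&', Tok.BadAmpersand)` rule works only because rules are tried in order: the entity patterns get first crack, so `&amp;` still lexes as CharEntity and only a leftover `&` falls through. A self-contained sketch of that ordering (illustrative regexes, not the oils lexer itself):

```python
# First-match-wins rule list: the bare '&' fallback must come last.
import re

RULES = [
    (re.compile(r'&#[0-9]+;'), 'DecChar'),
    (re.compile(r'&#x[0-9a-fA-F]+;'), 'HexChar'),
    (re.compile(r'&[a-zA-Z][a-zA-Z0-9]*;'), 'CharEntity'),
    (re.compile(r'&'), 'BadAmpersand'),  # fallback for a lone '&'
    (re.compile(r'[^&]+'), 'RawData'),
]

def tokens(s):
    pos = 0
    while pos < len(s):
        for pat, name in RULES:
            m = pat.match(s, pos)
            if m:
                yield name, m.group()
                pos = m.end()
                break

print(list(tokens('x & y &amp; z')))
# [('RawData', 'x '), ('BadAmpersand', '&'), ('RawData', ' y '),
#  ('CharEntity', '&amp;'), ('RawData', ' z')]
```

If the bare `&` rule were listed before the entity rules, `&amp;` would lex as BadAmpersand followed by RawData, which is why it sits at the end of CHAR_LEX.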
diff --git a/lazylex/html_test.py b/lazylex/html_test.py
index 502e82683..422073ac8 100755
--- a/lazylex/html_test.py
+++ b/lazylex/html_test.py
@@ -46,6 +46,10 @@ def testFindLineNum(self):
         line_num = html.FindLineNum(s, pos)
         print(line_num)
 
+    def testToText(self):
+        t = html.ToText('<b> three &lt; four && five </b>')
+        self.assertEqual(' three < four && five ', t)
+
 
 def _MakeTagLexer(s):
     lex = html.TagLexer(s)
@@ -118,13 +122,15 @@ def testEmptyMissingValues(self):
         slices = lex.AllAttrsRawSlice()
         log('slices %s', slices)
 
-        lex = _MakeTagLexer('''<a double="" single='' empty=value missing>''')
+        lex = _MakeTagLexer(
+            '''<a double="" single='' empty=value missing empty2=>''')
         all_attrs = lex.AllAttrsRaw()
         self.assertEqual([
             ('double', ''),
             ('single', ''),
             ('empty', 'value'),
             ('missing', ''),
+            ('empty2', ''),
         ], all_attrs)
         # TODO: should have
         log('all %s', all_attrs)
@@ -338,12 +344,25 @@ def testInvalid(self):
         else:
             self.fail('Expected LexError %r' % s)
 
+    def testValid(self):
+        for s in VALID_LEX:
+            tokens = Lex(s)
+            print()
+
+
+VALID_LEX = [
+    '<a>',
+    '<a href="foo">',
+    '<a href="?foo=1&bar=2">',
+
+    # Allowed with BadAmpersand
+    '<p>x & y</p>',
+]
 INVALID_LEX = [
-    # Should be &amp;
-    '<a>&',
-    '&',  # not finished
-    '&#',  # not finished
+    '<',
+    '&<',
+    '&amp;<',
 
     # Hm > is allowed?
     #'a > b',
     'a < b',
@@ -386,6 +405,7 @@
     # Conceding to reality - I used these myself
     '<a href=ble.sh>',
     '<a href=foo.html>',
+    '<a href=foo&bar>',
 
     # TODO: capitalization should be allowed
     #'<A>',
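The doc's "fix it for XML" remark is the reason a bare `&` gets its own token instead of being folded into RawData: a converter can rewrite exactly the BadAmpersand spans and leave everything else byte-for-byte intact. A hedged sketch, again assuming the patched `lazylex.html` API (`ValidTokens()` with explicit positions, and `Tok.BadAmpersand` as shown above):

```python
# HTM8 -> XML ampersand fixer: rewrite only BadAmpersand tokens.
from lazylex import html

def fix_ampersands(s):
    out = []
    pos = 0
    for tok_id, end_pos in html.ValidTokens(s, 0, len(s)):
        if tok_id == html.Tok.BadAmpersand:
            out.append('&amp;')  # the token is exactly one '&'
        else:
            out.append(s[pos:end_pos])
        pos = end_pos
    return ''.join(out)

print(fix_ampersands('<p>x & y</p>'))  # => '<p>x &amp; y</p>'
```

The same token-level approach would extend to `<` and `>`, which the new doc section notes is not yet handled consistently.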