From 07ebeba7ea6a6b7ce866c5befbb0e40cc05fde10 Mon Sep 17 00:00:00 2001
From: Andy C
Date: Sun, 12 Jan 2025 10:44:53 -0500
Subject: [PATCH] [lazylex/html] Allow unescaped &

This occurs in real data, and is similar to allowing < and > (which we
don't do everywhere yet).

The XML philosophy lost :)
---
 data_lang/htm8-test.sh | 11 +----------
 doc/htm8.md            | 15 +++++++++++++++
 lazylex/html.py        | 11 +++++++++--
 lazylex/html_test.py   | 30 +++++++++++++++++++++++++-----
 4 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/data_lang/htm8-test.sh b/data_lang/htm8-test.sh
index b227ed689..377ea8fe9 100755
--- a/data_lang/htm8-test.sh
+++ b/data_lang/htm8-test.sh
@@ -26,17 +26,8 @@
 # - I may also want to do this with JSON
 #
 # Not working yet:
-# - understanding all entities &zz;
-#   - there are over 2000 of them, not sure I want to build them all
-#     into the Oils binaries
 # - capital letters - I guess we can normalize the case
-#
-# Leniency:
-# - foo=1&bar=2 is extremely common
-# - well then does that mean you allow <pre>a & b</pre> - the quotes help
-# - I guess you can have a rule for unescaped &, just like unescaped backslash
-#   - you can warn about it, but it doesn't cause much problem?
-# We are already firmly in HTML territory, not in XML ...
+#   - islower()
 #
 # Features:
 # - Are there special rules for <script> and <style>?
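The "over 2000 of them" estimate in the deleted comment is easy to confirm: Python's stdlib ships the full HTML5 named-entity table. A quick check, using only the stdlib (not part of this patch):

```python
# Sizing the HTML5 named-entity table that the deleted comment refers to.
from html.entities import html5  # maps 'amp;' -> '&', etc.

print(len(html5))      # a bit over 2000, counting variants without ';'
print(html5['amp;'])   # '&'
print(html5['nbsp;'])  # '\xa0'
```

That size is why HTM8 lexes entity names generically (as CharEntity) rather than building the whole table into the Oils binaries.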
diff --git a/doc/htm8.md b/doc/htm8.md
index e2847ebc0..a21290104 100644
--- a/doc/htm8.md
+++ b/doc/htm8.md
@@ -133,6 +133,21 @@ Conflicts between HTML5 and XML:
 - Maybe validate any other declarations, like `<!DOCTYPE foo>`
 - Add XML header `<?xml version=...?>`, remove `<!DOCTYPE html>`
 
+## Leniency
+
+Angle brackets:
+
+- `<a foo="<">` is allowed, but `<a foo=">">` is disallowed
+- `<p>4>3</p>` is allowed, but `<p>4<3</p>` is disallowed
+
+This makes lexing the top-level structure easier.
+
+- unescaped `&` is allowed, unlike XML
+  - it's very common in `<a href="?foo=1&bar=2">`
+  - It's lexed as BadAmpersand, in case you want to fix it for XML.  Although
+    we don't do that for < and > consistently.
+
+
 ## Related
 
 - [ysh-doc-processing.html](ysh-doc-processing.html)
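To see the new leniency from the API side: a bare `&` now lexes as a BadAmpersand token rather than failing, and `ToText()` passes it through verbatim. A sketch, assuming the `lazylex.html` functions shown in this patch (`ValidTokens()`, `Tok`, `ToText()`), and that `ValidTokens()` takes explicit start/end positions as in the patch's `ToText()`:

```python
# Sketch against the patched lazylex.html API; token ids print as small ints.
from lazylex import html

s = '<p>x & y</p>'  # valid HTM8 now, though still invalid XML

pos = 0
for tok_id, end_pos in html.ValidTokens(s, 0, len(s)):
    print(tok_id, repr(s[pos:end_pos]))  # the lone '&' span is Tok.BadAmpersand
    pos = end_pos

# ToText() now treats BadAmpersand like RawData, so the '&' survives:
print(html.ToText(s))  # => 'x & y'
```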
diff --git a/lazylex/html.py b/lazylex/html.py
index 671328a24..66beffe31 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -113,7 +113,7 @@ def Print(self, s):
 # HTML Tokens
 
 # CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
-TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData Invalid EndOfStream'.split(
+TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand Invalid EndOfStream'.split(
 )
 
@@ -184,6 +184,8 @@ def MakeLexer(rules):
     (r'&\# [0-9]+ ;', Tok.DecChar),
     (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
     (r'& %s ;' % _NAME, Tok.CharEntity),
+    # Allow unquoted, and quoted
+    (r'&', Tok.BadAmpersand),
 ]
 
 LEXER = CHAR_LEX + [
@@ -741,6 +743,7 @@ def ReadUntilEndTag(it, tag_lexer, tag_name):
     'lt': '<',
     'gt': '>',
     'quot': '"',
+    'apos': "'",
 }
 
@@ -763,7 +766,7 @@ def ToText(s, left_pos=0, right_pos=-1):
 
     pos = left_pos
     for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
-        if tok_id == Tok.RawData:
+        if tok_id in (Tok.RawData, Tok.BadAmpersand):
             out.SkipTo(pos)
             out.PrintUntil(end_pos)
 
@@ -782,6 +785,10 @@ def ToText(s, left_pos=0, right_pos=-1):
         elif tok_id == Tok.DecChar:
             raise AssertionError('Dec Char %r' % s[pos:pos + 20])
 
+        else:
+            # Skip everything else
+            out.SkipTo(end_pos)
+
         pos = end_pos
 
     out.PrintTheRest()
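The one-character `(r'&', Tok.BadAmpersand)` rule works only because rules are tried in order: the entity patterns get first crack, so `&amp;` still lexes as CharEntity and only a leftover `&` falls through. A self-contained sketch of that ordering (illustrative regexes, not the oils lexer itself):

```python
# First-match-wins rule list: the bare '&' fallback must come last.
import re

RULES = [
    (re.compile(r'&#[0-9]+;'), 'DecChar'),
    (re.compile(r'&#x[0-9a-fA-F]+;'), 'HexChar'),
    (re.compile(r'&[a-zA-Z][a-zA-Z0-9]*;'), 'CharEntity'),
    (re.compile(r'&'), 'BadAmpersand'),  # fallback for a lone '&'
    (re.compile(r'[^&]+'), 'RawData'),
]

def tokens(s):
    pos = 0
    while pos < len(s):
        for pat, name in RULES:
            m = pat.match(s, pos)
            if m:
                yield name, m.group()
                pos = m.end()
                break

print(list(tokens('x & y &amp; z')))
# [('RawData', 'x '), ('BadAmpersand', '&'), ('RawData', ' y '),
#  ('CharEntity', '&amp;'), ('RawData', ' z')]
```

If the bare `&` rule were listed before the entity rules, `&amp;` would lex as BadAmpersand followed by RawData, which is why it sits at the end of CHAR_LEX.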
diff --git a/lazylex/html_test.py b/lazylex/html_test.py
index 502e82683..422073ac8 100755
--- a/lazylex/html_test.py
+++ b/lazylex/html_test.py
@@ -46,6 +46,10 @@ def testFindLineNum(self):
         line_num = html.FindLineNum(s, pos)
         print(line_num)
 
+    def testToText(self):
+        t = html.ToText('<b> three &lt; four && five </b>')
+        self.assertEqual(' three < four && five ', t)
+
 
 def _MakeTagLexer(s):
     lex = html.TagLexer(s)
@@ -118,13 +122,15 @@ def testEmptyMissingValues(self):
         slices = lex.AllAttrsRawSlice()
         log('slices %s', slices)
 
-        lex = _MakeTagLexer('''<a double="" single='' empty=value missing>''')
+        lex = _MakeTagLexer(
+            '''<a double="" single='' empty=value missing empty2=>''')
         all_attrs = lex.AllAttrsRaw()
         self.assertEqual([
             ('double', ''),
             ('single', ''),
             ('empty', 'value'),
             ('missing', ''),
+            ('empty2', ''),
         ], all_attrs)
         # TODO: should have
         log('all %s', all_attrs)
@@ -338,12 +344,25 @@ def testInvalid(self):
         else:
             self.fail('Expected LexError %r' % s)
 
+    def testValid(self):
+        for s in VALID_LEX:
+            tokens = Lex(s)
+            print()
+
+
+VALID_LEX = [
+    '<a>',
+    '<a href="foo">',
+    '<a href="?foo=1&bar=2">',
+
+    # Allowed with BadAmpersand
+    '<p>x & y</p>',
+]
 INVALID_LEX = [
-    # Should be &amp;
-    '<a>&',
-    '&',  # not finished
-    '&#',  # not finished
+    '<',
+    '&<',
+    '&amp;<',
 
     # Hm > is allowed?
     #'a > b',
     'a < b',
@@ -386,6 +405,7 @@
     # Conceding to reality - I used these myself
     '<a href=ble.sh>',
     '<a href=foo.html>',
+    '<a href=foo&bar>',
 
     # TODO: capitalization should be allowed
     #'<A>',
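The doc's "fix it for XML" remark is the reason a bare `&` gets its own token instead of being folded into RawData: a converter can rewrite exactly the BadAmpersand spans and leave everything else byte-for-byte intact. A hedged sketch, again assuming the patched `lazylex.html` API (`ValidTokens()` with explicit positions, and `Tok.BadAmpersand` as shown above):

```python
# HTM8 -> XML ampersand fixer: rewrite only BadAmpersand tokens.
from lazylex import html

def fix_ampersands(s):
    out = []
    pos = 0
    for tok_id, end_pos in html.ValidTokens(s, 0, len(s)):
        if tok_id == html.Tok.BadAmpersand:
            out.append('&amp;')  # the token is exactly one '&'
        else:
            out.append(s[pos:end_pos])
        pos = end_pos
    return ''.join(out)

print(fix_ampersands('<p>x & y</p>'))  # => '<p>x &amp; y</p>'
```

The same token-level approach would extend to `<` and `>`, which the new doc section notes is not yet handled consistently.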