[lazylex/html] Allow unescaped &
This occurs in real data, and is similar to allowing < and > (which we
don't do everywhere yet)

The XML philosophy lost :)
Andy C committed Jan 12, 2025
1 parent 2498981 commit 07ebeba
Showing 4 changed files with 50 additions and 17 deletions.
11 changes: 1 addition & 10 deletions data_lang/htm8-test.sh
@@ -26,17 +26,8 @@
# - I may also want to do this with JSON
#
# Not working yet:
# - understanding all entities &zz;
# - there are over 2000 of them, not sure I want to build them all into the Oils binaries
# - capital letters <TR/> - I guess we can normalize the case
#
# Leniency:
# - foo=1&bar=2 is extremely common
# - well then does that mean you allow <p>a & b</b> too?
# - and then it's not far from that to <p id="value >"> - the quotes help
# - I guess you can have a rule for unescaped &, just like unescaped backslash
# - you can warn about it, but it doesn't cause much problem?
# We are already firmly in HTML territory, not in XML ...
# - islower()
#
# Features:
# - Are there special rules for <svg> and <math>?
15 changes: 15 additions & 0 deletions doc/htm8.md
@@ -133,6 +133,21 @@ Conflicts between HTML5 and XML:
- Maybe validate any other declarations, like `<!DOCTYPE foo>`
- Add XML header `<?xml version=>`, remove `<!DOCTYPE html>`

## Leniency

Angle brackets:

- `<a foo="<">` is allowed, but `<a foo=">">` is disallowed
- `<p> 4>3 </p>` is allowed, but `<p> 4<3 </p>` is disallowed

This makes lexing the top-level structure easier.
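
A minimal sketch of this asymmetry, assuming `lazylex.html` exposes `ValidTokens` and `LexError` as the tests in this commit suggest (invalid input raises `LexError`):

```python
from lazylex import html  # assumed import path

def lexes_ok(s):
    # Drive the lexer over the whole string; LexError means rejection.
    try:
        for tok_id, end_pos in html.ValidTokens(s, 0, len(s)):
            pass
        return True
    except html.LexError:
        return False

print(lexes_ok('<p> 4>3 </p>'))  # True: a bare > in text is tolerated
print(lexes_ok('<p> 4<3 </p>'))  # False: a bare < looks like a tag start
```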

- unescaped `&` is allowed, unlike in XML
  - it's very common in `<a href="?foo=42&bar=99">`
  - it's lexed as `BadAmpersand`, in case you want to fix it up for XML
    (though we don't do that for `<` and `>` consistently)
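
For example, `ToText()` decodes real entities but passes a `BadAmpersand` through verbatim; this mirrors the `testToText` case added in `lazylex/html_test.py` below (the import path is an assumption):

```python
from lazylex import html  # assumed import path

# '&lt;' is a CharEntity and is decoded to '<'; the bare '&&' lexes as
# two BadAmpersand tokens and is printed through unchanged.
t = html.ToText('<b name="&amp;"> three &lt; four && five </b>')
assert t == ' three < four && five '
```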


## Related

- [ysh-doc-processing.html](ysh-doc-processing.html)
11 changes: 9 additions & 2 deletions lazylex/html.py
@@ -113,7 +113,7 @@ def Print(self, s):

# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData Invalid EndOfStream'.split(
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand Invalid EndOfStream'.split(
)


@@ -184,6 +184,8 @@ def MakeLexer(rules):
(r'&\# [0-9]+ ;', Tok.DecChar),
(r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
(r'& %s ;' % _NAME, Tok.CharEntity),
# Allow unquoted, and quoted
(r'&', Tok.BadAmpersand),
]

LEXER = CHAR_LEX + [
@@ -741,6 +743,7 @@ def ReadUntilEndTag(it, tag_lexer, tag_name):
'lt': '<',
'gt': '>',
'quot': '"',
'apos': "'",
}

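With `apos` added to the table above, `ToText()` (next hunk) can decode all five predefined XML entities. A small sketch, assuming this table is what backs `CharEntity` handling:

```python
from lazylex import html  # assumed import path

# &apos; now decodes like &amp;, &lt;, &gt;, and &quot;
print(html.ToText('<p>it&apos;s</p>'))  # prints: it's
```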

@@ -763,7 +766,7 @@ def ToText(s, left_pos=0, right_pos=-1):

pos = left_pos
for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
if tok_id == Tok.RawData:
if tok_id in (Tok.RawData, Tok.BadAmpersand):
out.SkipTo(pos)
out.PrintUntil(end_pos)

@@ -782,6 +785,10 @@ def ToText(s, left_pos=0, right_pos=-1):
elif tok_id == Tok.DecChar:
raise AssertionError('Dec Char %r' % s[pos:pos + 20])

else:
# Skip everything else
out.SkipTo(end_pos)

pos = end_pos

out.PrintTheRest()
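
Why the new `&` rule is safe: the `CHAR_LEX` rules are tried in order (as the rule list suggests), so the bare `&` pattern only fires when no entity pattern matches. A standalone sketch of that ordering, where the `_NAME` pattern is an assumption and the verbose-mode spaces are dropped since each pattern is compiled directly:

```python
import re

_NAME = r'[a-zA-Z][a-zA-Z0-9]*'  # assumed; the real definition is in html.py

# Same rule order as CHAR_LEX above.
CHAR_PATTERNS = [
    (re.compile(r'&\#[0-9]+;'), 'DecChar'),
    (re.compile(r'&\#x[0-9a-fA-F]+;'), 'HexChar'),
    (re.compile(r'&%s;' % _NAME), 'CharEntity'),
    (re.compile(r'&'), 'BadAmpersand'),  # last resort: a bare ampersand
]

def classify(s, pos=0):
    """Return (token name, end index) for the first rule matching at pos."""
    for regex, name in CHAR_PATTERNS:
        m = regex.match(s, pos)
        if m:
            return name, m.end()
    raise AssertionError('these rules only apply at an ampersand')

print(classify('&amp; etc.'))  # ('CharEntity', 5)
print(classify('&#x3C;'))      # ('HexChar', 6)
print(classify('&bar=99'))     # ('BadAmpersand', 1)
```
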
30 changes: 25 additions & 5 deletions lazylex/html_test.py
@@ -46,6 +46,10 @@ def testFindLineNum(self):
line_num = html.FindLineNum(s, pos)
print(line_num)

def testToText(self):
t = html.ToText('<b name="&amp;"> three &lt; four && five </b>')
self.assertEqual(' three < four && five ', t)


def _MakeTagLexer(s):
lex = html.TagLexer(s)
@@ -118,13 +122,15 @@ def testEmptyMissingValues(self):
slices = lex.AllAttrsRawSlice()
log('slices %s', slices)

lex = _MakeTagLexer('''<p double="" single='' empty= value missing>''')
lex = _MakeTagLexer(
'''<p double="" single='' empty= value missing empty2=>''')
all_attrs = lex.AllAttrsRaw()
self.assertEqual([
('double', ''),
('single', ''),
('empty', 'value'),
('missing', ''),
('empty2', ''),
], all_attrs)
# TODO: should have
log('all %s', all_attrs)
@@ -338,12 +344,25 @@ def testInvalid(self):
else:
self.fail('Expected LexError %r' % s)

def testValid(self):
for s in VALID_LEX:
tokens = Lex(s)
print()


VALID_LEX = [
'<foo>',
'<foo x=y>',
'<foo x="&">',

# Allowed with BadAmpersand
'<p> x & y </p>',
]

INVALID_LEX = [
# Should be &amp;
'<a>&',
'&amp', # not finished
'&#', # not finished
'<a><',
'&amp<',
'&<',
# Hm > is allowed?
#'a > b',
'a < b',
Expand Down Expand Up @@ -386,6 +405,7 @@ def testInvalid(self):
# Conceding to reality - I used these myself
'<a href=ble.sh></a>',
'<a href=foo.html></a>',
'<foo x="&"></foo>',

# TODO: capitalization should be allowed
#'<META><a></a>',
