Skip to content

Commit

Permalink
[htm8] Work on AttrLexer
Browse files Browse the repository at this point in the history
ReadName() and AttrNameEquals() kinda work.

Now I need to work on lexing values.

I discovered a problem with the previous definition of HTM8.  Something
like:

    <img src=?foo=bar/>

is hard to lex because the /> can be part of an unquoted attribute value,
or it can be a self-closing tag.

Although right now we are just excluding /.  Hm.  We should probably
enforce this for readability:

    <img src="?foo=bar/">
  • Loading branch information
Andy C committed Jan 16, 2025
1 parent a541a63 commit 5cc8b1b
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 17 deletions.
29 changes: 22 additions & 7 deletions data_lang/htm8.asdl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ module htm8

| Invalid
| EndOfStream

# Used by the AttrLexer
#| Whitespace
#| DoubleQuote
#| SingleQuote
generate [no_namespace_suffix] # cosmetic: call it h8_id, not h8_id_e


Expand All @@ -30,15 +35,25 @@ module htm8
generate [no_namespace_suffix]

attr_name =
Ok # Found an attribute
| Done # No more attributes
| Error(int pos) # LexError
Ok # Found an attribute
| Done # No more attributes
| Invalid # e.g. <a !>
generate [no_namespace_suffix]

attr_value_id =
UnquotedVal
| DoubleQuote
| SingleQuote
| Invalid
generate [no_namespace_suffix]

attr_value =
Missing # <a missing>
| Empty # <a empty= >
| Unquoted # <a unquoted=1 >
| Quoted # <a quoted="1" >
Missing # <a missing>
| Empty # <a empty= >
| Unquoted # <a unquoted=1 >
| DoubleQuote # <a quoted="1" >
| SingleQuote # <a quoted='1' >
# No end of stream here, it will just be Missing, and the next attr_name will fail
generate [no_namespace_suffix]

# This API is maybe more natural, but has more allocations
Expand Down
107 changes: 99 additions & 8 deletions data_lang/htm8.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
h8_tag_id_str, attr_name, attr_name_t,
attr_value, attr_value_t)
attr_value, attr_value_t, attr_value_id)
from doctools.util import log


Expand Down Expand Up @@ -414,6 +414,61 @@ def LookAhead(self, regex):
return m is not None


# Lexer spec for attribute *names* inside a tag.  Rules are tried in order;
# the first regex that matches at the current position wins.
# NOTE(review): the patterns contain literal spaces, so this assumes
# MakeLexer() compiles them with re.VERBOSE -- confirm.
A_NAME_LEX = [
# Leading whitespace is required, to separate attributes.
#
# If the = is not present, then we set the lexer in a state for
# attr_value.Missing.
(r'\s+ (%s) \s* (=)?' % _NAME, attr_name.Ok),
# unexpected EOF

# The closing > or /> is treated as end of stream, and it's not an error.
(r'\s* /? >', attr_name.Done),

# NUL should not be possible, because the top-level
# lexer presumably handles \x00 before we ever get here -- TODO confirm.

# e.g. < is an error
(r'.', attr_name.Invalid),
]

# Compiled once at import time; ReadName() iterates over these pairs.
A_NAME_LEX_COMPILED = MakeLexer(A_NAME_LEX)

# Lexer spec for attribute *values*.  Here we just loop on regular tokens.
#
# Examples:
# <a href = unquoted&amp;foo >
# <a href = unquoted&foo >   # BadAmpersand is allowed I guess
# <a href ="unquoted&foo" >  # double quoted
# <a href ='unquoted&foo' >  # single quoted
# <a href = what"foo" >      # HTML5 allows this, but we could disallow it if
#                              it's not common.  It opens up the j"" and $""
#                              extensions
# <a href = what'foo' >      # ditto
#
# Problem: <a href=foo/> - this is hard to recognize
# Because is the unquoted value "foo/" or "foo" ?

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
#
# Bug fix: Also disallow /

_UNQUOTED_VALUE = r'''[^ \t\r\n<>&/"'\x00]*'''

# The first matching token decides the value kind: a quote char starts a
# quoted value; otherwise we try an unquoted run.
# NOTE(review): _UNQUOTED_VALUE uses '*', so it matches the empty string --
# that seems to make the final '.' / Invalid rule unreachable; confirm intent.
A_VALUE_LEX = CHAR_LEX + [
(r'"', attr_value_id.DoubleQuote),
(r"'", attr_value_id.SingleQuote),
(_UNQUOTED_VALUE, attr_value_id.UnquotedVal),

#(r'[ \r\n\t]', h8_id.Whitespace), # terminates unquoted values
#(r'[^ \r\n\t&>\x00]', h8_id.RawData),
#(r'[>\x00]', h8_id.EndOfStream),
# e.g. < is an error
(r'.', attr_value_id.Invalid),
]

# Compiled once at import time; ReadRawValue() will iterate over these pairs.
A_VALUE_LEX_COMPILED = MakeLexer(A_VALUE_LEX)


class AttrLexer(object):
"""
We can also invert this
Expand Down Expand Up @@ -466,6 +521,11 @@ def __init__(self, s):
self.s = s
self.tag_name_pos = -1 # Invalid
self.tag_end_pos = -1
self.pos = -1

self.name_start = -1
self.name_end = -1
self.next_value_is_missing = False

def Init(self, tag_name_pos, end_pos):
# type: (int, int) -> None
Expand All @@ -485,6 +545,8 @@ def Init(self, tag_name_pos, end_pos):
self.tag_name_pos = tag_name_pos
self.end_pos = end_pos

self.pos = tag_name_pos

def ReadName(self):
    # type: () -> Tuple[attr_name_t, int, int]
    """Read the next attribute name, if any.

    Must be called after Init().

    Returns:
      A (status, name_start, name_end) tuple:
        attr_name.Ok      - a name was found; (name_start, name_end) delimit
                            it, and next_value_is_missing records whether an
                            '=' followed it.
        attr_name.Done    - the closing > or /> was reached; no more
                            attributes, and this is not an error.
        attr_name.Invalid - lexing error, with positions of -1, e.g.

            <a !>
            <a foo=bar !>
    """
    for pat, a in A_NAME_LEX_COMPILED:
        m = pat.match(self.s, self.pos)
        if m:
            if a == attr_name.Ok:
                #log('%r', m.groups())
                self.name_start = m.start(1)
                self.name_end = m.end(1)
                # Set state based on =: no '=' after the name means the
                # value will be attr_value.Missing.  (Assign, don't just
                # set True, so stale state from a previous attribute can't
                # leak through.)
                self.next_value_is_missing = m.group(2) is None
                # NOTE(review): self.pos is not advanced past the match, so
                # a second ReadName() call re-lexes the same attribute --
                # presumably WIP ("kinda work"); confirm.
                return attr_name.Ok, self.name_start, self.name_end
            else:
                # Reset state - e.g. you must call ReadName() again (and get
                # Ok) before AttrNameEquals() is meaningful.
                self.name_start = -1
                self.name_end = -1
                self.next_value_is_missing = False

                if a == attr_name.Invalid:
                    return attr_name.Invalid, -1, -1
                if a == attr_name.Done:
                    return attr_name.Done, -1, -1
    else:
        # The '.' / Invalid rule matches any char, so falling out of the
        # loop should be impossible (except perhaps at end of string).
        raise AssertionError('h8_id.Invalid rule should have matched')

def AttrNameEquals(self, s):
def _CanonicalAttrName(self):
# type: () -> str
assert self.name_start >= 0, self.name_start
assert self.name_end >= 0, self.name_end

attr_name = self.s[self.name_start:self.name_end]
if attr_name.islower():
return attr_name
else:
return attr_name.lower()

def AttrNameEquals(self, expected):
# type: (str) -> bool
"""
TODO: Must call this after ReadName() ?
Because that can FAIL.
"""
pass
return expected == self._CanonicalAttrName()

def ReadRawValue(self):
# type: () -> Tuple[attr_value_t, int, int]
Expand Down Expand Up @@ -550,10 +645,6 @@ def ReadValueAndDecode(self):
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)
Expand Down
61 changes: 59 additions & 2 deletions data_lang/htm8_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,61 @@ def testNoAttrs(self):
self.assertEqual(-1, name_start)
self.assertEqual(-1, name_end)

def testAttr(self):
def testInvalid(self):
    """'!' cannot start an attribute name, so ReadName() reports Invalid."""
    html = '<a !>'
    lexer = htm8.Lexer(html)

    tok_id, end_pos = lexer.Read()
    self.assertEqual(h8_id.StartTag, tok_id)

    a_lexer = htm8.AttrLexer(html)
    a_lexer.Init(lexer.TagNamePos(), end_pos)

    status, start, end = a_lexer.ReadName()
    self.assertEqual(attr_name.Invalid, status)
    # No name positions on error
    self.assertEqual(-1, start)
    self.assertEqual(-1, end)

def testEmpty(self):
    """<img src=/> has a name followed by '=', so the value is Empty, not Missing."""
    html = '<img src=/>'
    lexer = htm8.Lexer(html)

    tok_id, end_pos = lexer.Read()
    self.assertEqual(h8_id.StartEndTag, tok_id)

    a_lexer = htm8.AttrLexer(html)
    a_lexer.Init(lexer.TagNamePos(), end_pos)

    status, start, end = a_lexer.ReadName()
    self.assertEqual(attr_name.Ok, status)
    self.assertEqual(5, start)  # 'src' starts after '<img '
    self.assertEqual(8, end)
    # '=' was present, so the value is not Missing
    self.assertEqual(False, a_lexer.next_value_is_missing)

    self.assertEqual(True, a_lexer.AttrNameEquals('src'))
    self.assertEqual(False, a_lexer.AttrNameEquals('srcz'))

h = '<a href=foo>'
def testMissing(self):
    """<img SRC/> has a name with no '=', so the value is Missing.

    Also exercises case-insensitive name comparison.
    """
    html = '<img SRC/>'
    lexer = htm8.Lexer(html)

    tok_id, end_pos = lexer.Read()
    self.assertEqual(h8_id.StartEndTag, tok_id)

    a_lexer = htm8.AttrLexer(html)
    a_lexer.Init(lexer.TagNamePos(), end_pos)

    status, start, end = a_lexer.ReadName()
    self.assertEqual(attr_name.Ok, status)
    self.assertEqual(5, start)  # 'SRC' starts after '<img '
    self.assertEqual(8, end)
    # No '=' after the name
    self.assertEqual(True, a_lexer.next_value_is_missing)

    # Comparison is against the lowercased (canonical) name
    self.assertEqual(True, a_lexer.AttrNameEquals('src'))
    self.assertEqual(False, a_lexer.AttrNameEquals('srcz'))

def testAttr(self):
h = '<a x=foo>'
lx = htm8.Lexer(h)

tok_id, end_pos = lx.Read()
Expand All @@ -91,6 +143,11 @@ def testAttr(self):
attr_lexer = htm8.AttrLexer(h)
attr_lexer.Init(lx.TagNamePos(), end_pos)
n, name_start, name_end = attr_lexer.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(3, name_start)
self.assertEqual(4, name_end)

# Note: internal state set according to =


def ValidTokenList(s, no_special_tags=False):
Expand Down

0 comments on commit 5cc8b1b

Please sign in to comment.