Skip to content

Commit

Permalink
[htm8 refactor] Cleanup, test refactoring, API design
Browse files Browse the repository at this point in the history
  • Loading branch information
Andy C committed Jan 16, 2025
1 parent def622c commit f167146
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 103 deletions.
56 changes: 27 additions & 29 deletions data_lang/htm8.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,8 @@ def LookAhead(self, regex):
#
# For now, I guess we live with <a href=?foo/>

A_VALUE_LEX = CHAR_LEX + [
# What comes after = ?
A_VALUE_LEX = [
(r'"', h8_val_id.DoubleQuote),
(r"'", h8_val_id.SingleQuote),
(_UNQUOTED_VALUE, h8_val_id.UnquotedVal),
Expand All @@ -478,6 +479,18 @@ def LookAhead(self, regex):

A_VALUE_LEX_COMPILED = MakeLexer(A_VALUE_LEX)

# What's inside "" or '' ?
#
# Rules for the contents of a quoted attribute value.  _QuotedRead() tries the
# compiled patterns in list order and returns the first one that matches, so
# rule order is significant (assuming MakeLexer() preserves the list order).
QUOTED_VALUE_LEX = CHAR_LEX + [
    # Each quote character gets its own token
    (r'"', h8_id.DoubleQuote),
    (r"'", h8_id.SingleQuote),
    (r'<', h8_id.BadLessThan),  # BadAmpersand is in CharLex
    # A run of characters that are not quotes, < > & or NUL
    (r'''[^"'<>&\x00]+''', h8_id.RawData),
    # Fallback: a single character not matched by an earlier rule.
    # This includes > - it is not BadGreaterThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

QUOTED_VALUE_LEX_COMPILED = MakeLexer(QUOTED_VALUE_LEX)


class AttrLexer(object):
"""
Expand All @@ -490,7 +503,7 @@ class AttrLexer(object):
print('div')
# TODO: also pass Optional[List[]] out_tokens?
v, start_pos, end_pos = attr_lx.ReadRawValue()
v, start_pos, end_pos = attr_lx.ReadValue()
"""

def __init__(self, s):
Expand Down Expand Up @@ -582,21 +595,21 @@ def AttrNameEquals(self, expected):
return expected == self._CanonicalAttrName()

def _QuotedRead(self):
# type: () -> Tuple[h8_id, end_pos]
# type: () -> Tuple[h8_id_t, int]

for pat, tok_id in QUOTED_VALUE_LEX_COMPILED:
m = pat.match(self.s, self.pos)
if m:
end_pos = m.end(0) # Advance
log('_QuotedRead %r', self.s[self.pos:end_pos])
#log('_QuotedRead %r', self.s[self.pos:end_pos])
return tok_id, end_pos
else:
context = self.s[self.pos:self.pos + 10]
raise AssertionError('h8_id.Invalid rule should have matched %r' %
context)

def ReadRawValue(self):
# type: () -> Tuple[attr_value_t, int, int]
def ReadValue(self, tokens_out=None):
# type: (Optional[List[Tuple[h8_id, int]]]) -> Tuple[attr_value_t, int, int]
"""Read the attribute value.
In general, it is escaped or "raw"
Expand All @@ -621,9 +634,13 @@ def ReadRawValue(self):
self.pos = m.end(0) # Advance

#log('m %s', m.groups())

# Note: An unquoted value can't contain &amp; etc. now, so there
# is no unquoting, and tokens_out is not respected for it.
if a == h8_val_id.UnquotedVal:
return attr_value_e.Unquoted, m.start(0), m.end(0)

# TODO: respect tokens_out
if a == h8_val_id.DoubleQuote:
left_inner = self.pos
while True:
Expand All @@ -634,8 +651,8 @@ def ReadRawValue(self):
return attr_value_e.DoubleQuoted, left_inner, self.pos
self.pos = q_end_pos # advance

# TODO: respect tokens_out
if a == h8_val_id.SingleQuote:

left_inner = self.pos
while True:
tok_id, q_end_pos = self._QuotedRead()
Expand All @@ -651,18 +668,10 @@ def ReadRawValue(self):
else:
raise AssertionError('h8_val_id.NoMatch rule should have matched')

def SkipValue(self):
# type: () -> None
# Just ignore it and return
self.ReadRawValue()

def ReadValueAndDecode(self):
# type: () -> str
"""Read the attribute vlaue
"""
# TODO: tokenize it
pass

#
# OLD API - REMOVE THIS
#

# Tag names:
# Match <a or </a
Expand Down Expand Up @@ -894,17 +903,6 @@ def Tokens(self):

ATTR_VALUE_LEX_COMPILED = MakeLexer(ATTR_VALUE_LEX)

QUOTED_VALUE_LEX = CHAR_LEX + [
(r'"', h8_id.DoubleQuote),
(r"'", h8_id.SingleQuote),
(r'<', h8_id.BadLessThan), # BadAmpersand is in CharLex
(r'''[^"'<>&\x00]+''', h8_id.RawData),
# This includes > - it is not BadGreaterThan because it's NOT recoverable
(r'.', h8_id.Invalid),
]

QUOTED_VALUE_LEX_COMPILED = MakeLexer(QUOTED_VALUE_LEX)


class AttrValueLexer(object):
"""
Expand Down
93 changes: 32 additions & 61 deletions data_lang/htm8_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import unittest
import re

from typing import List, Tuple
from typing import List, Tuple, Any

from data_lang import htm8
from doctools.util import log
Expand Down Expand Up @@ -54,6 +54,20 @@ def testFindLineNum(self):
print(line_num)


def _MakeAttrLexer(t, h, expected_tag=h8_id.StartTag):
    # type: (Any, str, Any) -> htm8.AttrLexer
    """Lex the first token of h and return an AttrLexer set up for that tag.

    Args:
      t: test case object; only its assertEqual() method is used
      h: HTM8 source text whose first token is the tag under test
      expected_tag: h8_id value that the first token must have

    Returns:
      An htm8.AttrLexer over h, initialized with the lexer's tag name
      position and the end position of the first token.
    """
    lx = htm8.Lexer(h)

    # The first token must be the tag whose attributes the caller will read.
    tok_id, end_pos = lx.Read()
    t.assertEqual(expected_tag, tok_id)

    attr_lx = htm8.AttrLexer(h)
    attr_lx.Init(lx.TagNamePos(), end_pos)

    return attr_lx


class AttrLexerTest(unittest.TestCase):

def testNoAttrs(self):
Expand Down Expand Up @@ -82,43 +96,31 @@ def testNoAttrs(self):
self.assertEqual(-1, name_end)

try:
result = attr_lx.ReadRawValue()
result = attr_lx.ReadValue()
except AssertionError as e:
print(e)
else:
self.fail('should have failed')

def testInvalid(self):
h = '<a !>'
lx = htm8.Lexer(h)

tok_id, end_pos = lx.Read()
self.assertEqual(h8_id.StartTag, tok_id)

attr_lx = htm8.AttrLexer(h)
attr_lx.Init(lx.TagNamePos(), end_pos)
attr_lx = _MakeAttrLexer(self, h)

n, name_start, name_end = attr_lx.ReadName()
self.assertEqual(n, attr_name.Invalid)
self.assertEqual(-1, name_start)
self.assertEqual(-1, name_end)

try:
result = attr_lx.ReadRawValue()
result = attr_lx.ReadValue()
except AssertionError as e:
print(e)
else:
self.fail('should have failed')

def testEmpty(self):
h = '<img src=/>'
lx = htm8.Lexer(h)

tok_id, end_pos = lx.Read()
self.assertEqual(h8_id.StartEndTag, tok_id)

attr_lx = htm8.AttrLexer(h)
attr_lx.Init(lx.TagNamePos(), end_pos)
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

n, name_start, name_end = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
Expand All @@ -129,21 +131,15 @@ def testEmpty(self):
self.assertEqual(True, attr_lx.AttrNameEquals('src'))
self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

v, attr_start, attr_end = attr_lx.ReadRawValue()
v, attr_start, attr_end = attr_lx.ReadValue()
log('v = %s', attr_value_str(v))
self.assertEqual(attr_value_e.Empty, v)
self.assertEqual(-1, attr_start)
self.assertEqual(-1, attr_end)

def testMissing(self):
h = '<img SRC/>'
lx = htm8.Lexer(h)

tok_id, end_pos = lx.Read()
self.assertEqual(h8_id.StartEndTag, tok_id)

attr_lx = htm8.AttrLexer(h)
attr_lx.Init(lx.TagNamePos(), end_pos)
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

n, name_start, name_end = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
Expand All @@ -154,27 +150,22 @@ def testMissing(self):
self.assertEqual(True, attr_lx.AttrNameEquals('src'))
self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

v, attr_start, attr_end = attr_lx.ReadRawValue()
v, attr_start, attr_end = attr_lx.ReadValue()
self.assertEqual(attr_value_e.Missing, v)
self.assertEqual(-1, attr_start)
self.assertEqual(-1, attr_end)

def testUnquoted(self):
# CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
h = '<a x=foo/>'
lx = htm8.Lexer(h)

tok_id, end_pos = lx.Read()
self.assertEqual(h8_id.StartEndTag, tok_id)
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

attr_lx = htm8.AttrLexer(h)
attr_lx.Init(lx.TagNamePos(), end_pos)
n, name_start, name_end = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(3, name_start)
self.assertEqual(4, name_end)

v, attr_start, attr_end = attr_lx.ReadRawValue()
v, attr_start, attr_end = attr_lx.ReadValue()

log('v = %s', attr_value_str(v))
log('unquoted val %r', h[attr_start:attr_end])
Expand All @@ -185,19 +176,14 @@ def testUnquoted(self):

def testDoubleQuoted(self):
h = '<a x="f&">'
lx = htm8.Lexer(h)

tok_id, end_pos = lx.Read()
self.assertEqual(h8_id.StartTag, tok_id)
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

attr_lx = htm8.AttrLexer(h)
attr_lx.Init(lx.TagNamePos(), end_pos)
n, name_start, name_end = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(3, name_start)
self.assertEqual(4, name_end)

v, attr_start, attr_end = attr_lx.ReadRawValue()
v, attr_start, attr_end = attr_lx.ReadValue()

log('v = %s', attr_value_str(v))
log('val %r', h[attr_start:attr_end])
Expand All @@ -208,19 +194,14 @@ def testDoubleQuoted(self):

def testSingleQuoted(self):
h = "<a x='&f'>"
lx = htm8.Lexer(h)
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

tok_id, end_pos = lx.Read()
self.assertEqual(h8_id.StartTag, tok_id)

attr_lx = htm8.AttrLexer(h)
attr_lx.Init(lx.TagNamePos(), end_pos)
n, name_start, name_end = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(3, name_start)
self.assertEqual(4, name_end)

v, attr_start, attr_end = attr_lx.ReadRawValue()
v, attr_start, attr_end = attr_lx.ReadValue()

log('v = %s', attr_value_str(v))
log('unquoted val %r', h[attr_start:attr_end])
Expand All @@ -231,41 +212,31 @@ def testSingleQuoted(self):

def testDoubleQuoted_Bad(self):
h = '<a x="foo>'
lx = htm8.Lexer(h)
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

tok_id, end_pos = lx.Read()
self.assertEqual(h8_id.StartTag, tok_id)

attr_lx = htm8.AttrLexer(h)
attr_lx.Init(lx.TagNamePos(), end_pos)
n, name_start, name_end = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(3, name_start)
self.assertEqual(4, name_end)

try:
v, attr_start, attr_end = attr_lx.ReadRawValue()
v, attr_start, attr_end = attr_lx.ReadValue()
except htm8.LexError as e:
print(e)
else:
self.fail('Expected LexError')

def testSingleQuoted_Bad(self):
h = "<a x='foo>"
lx = htm8.Lexer(h)

tok_id, end_pos = lx.Read()
self.assertEqual(h8_id.StartTag, tok_id)
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

attr_lx = htm8.AttrLexer(h)
attr_lx.Init(lx.TagNamePos(), end_pos)
n, name_start, name_end = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(3, name_start)
self.assertEqual(4, name_end)

try:
v, attr_start, attr_end = attr_lx.ReadRawValue()
v, attr_start, attr_end = attr_lx.ReadValue()
except htm8.LexError as e:
print(e)
else:
Expand Down
Loading

0 comments on commit f167146

Please sign in to comment.