From aee71f136719ab917e3b767c6ef937e8a351767c Mon Sep 17 00:00:00 2001 From: Andy C Date: Sat, 18 Jan 2025 12:15:47 -0500 Subject: [PATCH] [htm8 refactor] Move old TagLexer API out I tried to refactor this, but the old code is crufty and heavily stateful. Gah. I think we need efficient CSS-style selectors for it. There is a bunch of copy and paste across doctools/oils_doc.py doctools/help_gen.py --- build/doc.sh | 28 ++++- data_lang/htm8.py | 240 ++--------------------------------------- data_lang/htm8_test.py | 28 ----- doctools/help_gen.py | 4 +- doctools/oils_doc.py | 8 +- lazylex/html.py | 224 +++++++++++++++++++++++++++++++++++++- lazylex/html_test.py | 29 +++++ 7 files changed, 290 insertions(+), 271 deletions(-) diff --git a/build/doc.sh b/build/doc.sh index 32177f837..03f16ea2f 100755 --- a/build/doc.sh +++ b/build/doc.sh @@ -173,7 +173,7 @@ readonly MARKDOWN_DOCS=( # A better fix would be to implement json_utf8.load(f), which doesn't decode # into unicode instances. This would remove useless conversions. -readonly TIMESTAMP=$(date --rfc-email) +DOC_TIMESTAMP=${DOC_TIMESTAMP:-$(date --rfc-email)} split-and-render() { local src=${1:-doc/known-differences.md} @@ -194,7 +194,7 @@ split-and-render() { local css_files="$web_url/base.css $web_url/manual.css $web_url/toc.css $web_url/language.css $web_url/code.css" PYTHONPATH='.:vendor' doctools/split_doc.py \ - -v build_timestamp="$TIMESTAMP" \ + -v build_timestamp="$DOC_TIMESTAMP" \ -v oil_version="$OIL_VERSION" \ -v css_files="$css_files" \ -v all_docs_url='.' 
\ @@ -268,7 +268,7 @@ render-only() { "css_files": "$css_files", "all_docs_url": ".", - "build_timestamp": "$TIMESTAMP", + "build_timestamp": "$DOC_TIMESTAMP", "oil_version": "$OIL_VERSION" } EOF @@ -750,5 +750,27 @@ soil-run() { run-for-release } +# +# Golden tests +# +# $0 golden-tree +# $0 deterministic # with new code +# $0 compare-golden + +deterministic() { + # build without varying timestamp + DOC_TIMESTAMP='GOLD' $0 soil-run +} + +golden-tree() { + rm -r -f _release/VERSION/ _release/VERSION_gold/ + deterministic + cp -r _release/VERSION/ _release/VERSION_gold +} + +compare-golden() { + diff -r -u _release/VERSION_gold _release/VERSION/ +} + "$@" diff --git a/data_lang/htm8.py b/data_lang/htm8.py index 3e6db5838..b64e69c8d 100644 --- a/data_lang/htm8.py +++ b/data_lang/htm8.py @@ -42,10 +42,9 @@ import re -from typing import Dict, List, Tuple, Optional, IO, Iterator, Any +from typing import Dict, List, Tuple, Optional, IO, Any -from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t, - h8_tag_id_str, attr_name, attr_name_t, +from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, attr_name, attr_name_t, attr_name_str, attr_value_e, attr_value_t, h8_val_id) from doctools.util import log @@ -162,14 +161,15 @@ def MakeLexer(rules): # Lexers # -_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*' # must start with letter +_NAME_RE = r'[a-zA-Z][a-zA-Z0-9:_\-]*' # must start with letter CHAR_LEX = [ # Characters # https://www.w3.org/TR/xml/#sec-references (r'&\# [0-9]+ ;', h8_id.DecChar), (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar), - (r'& %s ;' % _NAME, h8_id.CharEntity), + # TODO: shouldn't use _NAME_RE? Just letters + (r'& %s ;' % _NAME_RE, h8_id.CharEntity), # Allow unquoted, and quoted (r'&', h8_id.BadAmpersand), ] @@ -203,11 +203,11 @@ def MakeLexer(rules): # - We look for a valid tag name, but we don't validate attributes. # That's done in the tag lexer. 
# - We don't allow leading whitespace - (r'' % _NAME, h8_id.EndTag), + (r'' % _NAME_RE, h8_id.EndTag), # self-closing
comes before StartTag # could/should these be collapsed into one rule? - (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag), # end - (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag), # start + (r'< (%s) [^>\x00]* />' % _NAME_RE, h8_id.StartEndTag), # end + (r'< (%s) [^>\x00]* >' % _NAME_RE, h8_id.StartTag), # start # HTML5 allows unescaped > in raw data, but < is not allowed. # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html @@ -395,7 +395,7 @@ def LookAhead(self, regex): # # If the = is not present, then we set the lexer in a state for # attr_value_e.Missing. - (r'\s+ (%s) \s* (=)? \s*' % _NAME, attr_name.Ok), + (r'\s+ (%s) \s* (=)? \s*' % _NAME_RE, attr_name.Ok), # unexpected EOF # The closing > or /> is treated as end of stream, and it's not an error. @@ -420,8 +420,6 @@ def LookAhead(self, regex): # it's not common. It opens up the j"" and $"" extensions # # ditto -# TODO: get rid of OLD copy -_UNQUOTED_VALUE_OLD = r'''[^ \t\r\n<>&"'\x00]*''' _UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]+''' # What comes after = ? @@ -758,223 +756,3 @@ def AllAttrsRaw(attr_lx): v = s[val_start:val_end] pairs.append((n, v)) return pairs - - -# -# OLD API - REMOVE THIS -# - -# Tag names: -# Match ', re.VERBOSE) - -# To match href="foo" -# Note: in HTML5 and XML, single quoted attributes are also valid - -#