Skip to content

Commit

Permalink
Merge pull request #4 from zytedata/fast-no-index
Browse files (browse the repository at this point in the history)
Improve performance by avoiding indexing
  • Loading branch information
lopuhin authored Apr 24, 2024
2 parents 19b3ab9 + 31c42e8 commit ff4987c
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 24 deletions.
5 changes: 3 additions & 2 deletions clear_html/formatted_text/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from clear_html.formatted_text.headings import normalize_headings_level
from clear_html.formatted_text.utils import (
clean_incomplete_structures,
double_br,
kill_tag_content,
remove_empty_tags,
set_article_tag_as_root,
Expand Down Expand Up @@ -116,10 +117,10 @@ def paragraphy(doc: HtmlElement):
start, end = None, None
for idx, child in enumerate(doc):
if child.tag == "br":
if idx == 0 or doc[idx - 1].tag != "br" or has_tail(doc[idx - 1]):
if not double_br(child.getprevious()):
# A br without previous consecutive br was found
start = idx
if idx == n_children - 1 or doc[idx + 1].tag != "br" or has_tail(child):
if not double_br(child):
# A br without next consecutive br was found
end = idx
if start == end:
Expand Down
34 changes: 17 additions & 17 deletions clear_html/formatted_text/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,17 +157,18 @@ def drop_tag_preserve_spacing(doc: HtmlElement, preserve_content=True):
# carefully add double brs in some cases to
# respect the separation between text chunks
# Not known html tags are considered as inline elements by default.
idx = parent.index(doc)
doc_prev = doc.getprevious()
doc_next = doc.getnext()

prev_is_inline = (
idx != 0
and parent[idx - 1].tag in PHRASING_CONTENT
and not _double_br(parent, idx - 2, idx - 1)
doc_prev is not None
and doc_prev.tag in PHRASING_CONTENT
and not double_br(doc_prev.getprevious())
)
after_is_inline = (
idx != len(parent) - 1
and parent[idx + 1].tag in PHRASING_CONTENT
and not _double_br(parent, idx + 1, idx + 2)
doc_next is not None
and doc_next.tag in PHRASING_CONTENT
and not double_br(doc_next)
)

has_text_prev = bool(prev_text(doc).strip()) or prev_is_inline
Expand All @@ -177,29 +178,28 @@ def drop_tag_preserve_spacing(doc: HtmlElement, preserve_content=True):
if has_text_prev and (has_text_inside or has_text_after):
# Insert double br before
for i in range(2):
parent.insert(idx, Element("br"))
idx += 2
doc.addprevious(Element("br"))
if has_text_inside and has_text_after:
# Insert brs after
last_br = Element("br")
last_br.tail = doc.tail
doc.tail = None
parent.insert(idx + 1, last_br)
parent.insert(idx + 1, Element("br"))
doc.addnext(last_br)
doc.addnext(Element("br"))
if preserve_content:
doc.drop_tag()
else:
doc.drop_tree()


def _double_br(doc: HtmlElement, start: int, end: int):
"""True if double br in doc[start:end] (end-start must be 1)"""
if end - start != 1:
def double_br(doc: Optional[HtmlElement]):
"""True if doc and next element are "br" tags without text in between."""
if doc is None or doc.tag != "br":
return False
if not all(idx >= 0 and idx < len(doc) for idx in (start, end)):
doc_next = doc.getnext()
if doc_next is None or doc_next.tag != "br":
return False
both_brs = (doc[start].tag == "br") and (doc[end].tag == "br")
return both_brs and not has_tail(doc[start])
return not has_tail(doc)


def has_no_content(doc: HtmlElement) -> bool:
Expand Down
14 changes: 9 additions & 5 deletions clear_html/lxml_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,16 @@ def prev_text(doc: HtmlElement) -> str:
parent = doc.getparent()
if parent is None:
return ""
idx = parent.index(doc)
if idx == 0:
text = parent.text
previous = doc.getprevious()
if previous is None:
parent = doc.getparent()
if parent is None:
text = ""
else:
text = parent.text or ""
else:
text = parent[idx - 1].tail
return text or ""
text = previous.tail or ""
return text


def iter_deep_first_post_order(doc: HtmlElement) -> Generator[HtmlElement, Any, None]:
Expand Down

0 comments on commit ff4987c

Please sign in to comment.