Skip to content

Commit

Permalink
don't use indexing for paragraphy
Browse files (browse the repository at this point in the history)
  • Loading branch information
lopuhin committed Apr 24, 2024
1 parent c764dfd commit 27f4c0f
Showing 1 changed file with 7 additions and 10 deletions.
17 changes (7 additions & 10 deletions) in clear_html/formatted_text/main.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -111,20 +111,17 @@ def paragraphy(doc: HtmlElement):
when possible. Document is updated inline.
"""
# Let's detect the sequences of consecutive br
children = list(doc)
n_children = len(children)
n_children = len(doc)
br_sequences: List[Tuple[int, int]] = []
start, end = None, None
for idx, child in enumerate(children):
for idx, child in enumerate(doc):
if child.tag == "br":
if idx == 0 or children[idx - 1].tag != "br" or has_tail(children[idx - 1]):
prev_child = child.getprevious()
if prev_child is None or prev_child.tag != "br" or has_tail(prev_child):
# A br without previous consecutive br was found
start = idx
if (
idx == n_children - 1
or children[idx + 1].tag != "br"
or has_tail(child)
):
next_child = child.getnext()
if next_child is None or next_child.tag != "br" or has_tail(child):
# A br without next consecutive br was found
end = idx
if start == end:
Expand All @@ -142,7 +139,7 @@ def paragraphy(doc: HtmlElement):

# Let's split the node into different paragraphs
br_sequences.append((n_children, n_children)) # To get last chunk included
children = [copy.copy(c) for c in children]
children = [copy.copy(c) for c in doc]
del doc[:n_children]

last_inline_chunk: List[HtmlElement] = []
Expand Down

0 comments on commit 27f4c0f

Please sign in to comment.