Skip to content

Commit

Permalink
use extra_attrs
Browse files Browse the repository at this point in the history
  • Loading branch information
QAN committed Jul 2, 2024
1 parent 76752c4 commit 1f57556
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 28 deletions.
6 changes: 3 additions & 3 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,9 +559,9 @@ def filter(self, test_function: Callable[[T_obj], bool]) -> "FilteredPage":

def dedupe_chars(self, **kwargs: Any) -> "FilteredPage":
"""
Removes duplicate chars — those sharing the same text, fontname, size,
and positioning (within `tolerance`) as other characters on the page.
Fontname and size properties can be ignored when comparing the chars.
Removes duplicate chars — those sharing the same text and positioning
(within `tolerance`) as other characters in the set. Use extra_attrs to
be more restrictive with the properties shared by the matching chars.
"""
p = FilteredPage(self, lambda x: True)
p._objects = {kind: objs for kind, objs in self.objects.items()}
Expand Down
19 changes: 5 additions & 14 deletions pdfplumber/utils/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
Callable,
Dict,
Generator,
Iterable,
List,
Match,
Optional,
Expand Down Expand Up @@ -785,22 +784,14 @@ def extract_text_simple(
def dedupe_chars(
chars: T_obj_list,
tolerance: T_num = 1,
ignore_char_properties: Optional[Iterable[str]] = None,
extra_attrs: Optional[Tuple[str, ...]] = ("fontname", "size"),
) -> T_obj_list:
"""
Removes duplicate chars — those sharing the same text, fontname, size,
and positioning (within `tolerance`) as other characters in the set.
Fontname and size properties can be ignored when comparing the chars.
Removes duplicate chars — those sharing the same text and positioning
(within `tolerance`) as other characters in the set. Use extra_attrs to
be more restrictive with the properties shared by the matching chars.
"""
char_props = {"text", "fontname", "size", "upright"}
if ignore_char_properties is not None:
for prop in ignore_char_properties:
if prop in ignore_char_properties:
char_props.remove(prop)
else:
raise KeyError(f"Cannot tolerate {prop} in dedupe chars")

key = itemgetter(*char_props)
key = itemgetter(*("upright", "text"), *(extra_attrs or tuple()))
pos_key = itemgetter("doctop", "x0")

def yield_unique_chars(chars: T_obj_list) -> Generator[T_obj, None, None]:
Expand Down
24 changes: 13 additions & 11 deletions tests/test_dedupe_chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def test_extract_text2(self):
== "UE 8. Circulation - Métabolismes"
)

def test_ignore_char_props(self):
def test_extra_attrs(self):
path = os.path.join(HERE, "pdfs/issue-1114-dedupe-chars.pdf")
pdf = pdfplumber.open(path)
page = pdf.pages[0]
Expand All @@ -100,12 +100,10 @@ def dup_chars(s: str) -> str:
dup_text = text
gt.append((text, should_dedup, dup_text))

keys_list = ["no_dedupe", {}, {"size"}, {"fontname"}, {"size", "fontname"}]
keys_list = ["no_dedupe", (), ("size",), ("fontname",), ("size", "fontname")]
for keys in keys_list:
if keys != "no_dedupe":
filtered_page = page.dedupe_chars(
tolerance=2, ignore_char_properties=keys
)
filtered_page = page.dedupe_chars(tolerance=2, extra_attrs=keys)
else:
filtered_page = page
for i, line in enumerate(
Expand All @@ -116,13 +114,17 @@ def dup_chars(s: str) -> str:
should_dedup = False
if isinstance(should_dedup, str):
if should_dedup in keys:
assert (
line == text
), f"Should not be duplicated when ignoring {should_dedup}"
fail_msg = (
f"{should_dedup} is not required to match "
"so it should be duplicated"
)
assert line == dup_text, fail_msg
else:
assert (
line == dup_text
), f"{should_dedup} is not ignored so it should be duplicated"
fail_msg = (
"Should not be duplicated "
f"when requiring matching {should_dedup}"
)
assert line == text, fail_msg
elif should_dedup:
assert line == text
else:
Expand Down

0 comments on commit 1f57556

Please sign in to comment.