use extra_attrs

jsvine · Jul 2, 2024 · 1f57556 · 1f57556
1 parent 76752c4
commit 1f57556
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 28 deletions.
diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -559,9 +559,9 @@ def filter(self, test_function: Callable[[T_obj], bool]) -> "FilteredPage":
 
     def dedupe_chars(self, **kwargs: Any) -> "FilteredPage":
         """
-        Removes duplicate chars — those sharing the same text, fontname, size,
-        and positioning (within `tolerance`) as other characters on the page.
-        Fontname and size properties can be ignored when comparing the chars.
+        Removes duplicate chars — those sharing the same text and positioning
+        (within `tolerance`) as other characters in the set. Use extra_attrs to
+        be more restrictive with the properties shared by the matching chars.
         """
         p = FilteredPage(self, lambda x: True)
         p._objects = {kind: objs for kind, objs in self.objects.items()}

diff --git a/pdfplumber/utils/text.py b/pdfplumber/utils/text.py
@@ -9,7 +9,6 @@
     Callable,
     Dict,
     Generator,
-    Iterable,
     List,
     Match,
     Optional,
@@ -785,22 +784,14 @@ def extract_text_simple(
 def dedupe_chars(
     chars: T_obj_list,
     tolerance: T_num = 1,
-    ignore_char_properties: Optional[Iterable[str]] = None,
+    extra_attrs: Optional[Tuple[str, ...]] = ("fontname", "size"),
 ) -> T_obj_list:
     """
-    Removes duplicate chars — those sharing the same text, fontname, size,
-    and positioning (within `tolerance`) as other characters in the set.
-    Fontname and size properties can be ignored when comparing the chars.
+    Removes duplicate chars — those sharing the same text and positioning
+    (within `tolerance`) as other characters in the set. Use extra_attrs to
+    be more restrictive with the properties shared by the matching chars.
     """
-    char_props = {"text", "fontname", "size", "upright"}
-    if ignore_char_properties is not None:
-        for prop in ignore_char_properties:
-            if prop in ignore_char_properties:
-                char_props.remove(prop)
-            else:
-                raise KeyError(f"Cannot tolerate {prop} in dedupe chars")
-
-    key = itemgetter(*char_props)
+    key = itemgetter(*("upright", "text"), *(extra_attrs or tuple()))
     pos_key = itemgetter("doctop", "x0")
 
     def yield_unique_chars(chars: T_obj_list) -> Generator[T_obj, None, None]:

diff --git a/tests/test_dedupe_chars.py b/tests/test_dedupe_chars.py
@@ -73,7 +73,7 @@ def test_extract_text2(self):
             == "UE 8. Circulation - Métabolismes"
         )
 
-    def test_ignore_char_props(self):
+    def test_extra_attrs(self):
         path = os.path.join(HERE, "pdfs/issue-1114-dedupe-chars.pdf")
         pdf = pdfplumber.open(path)
         page = pdf.pages[0]
@@ -100,12 +100,10 @@ def dup_chars(s: str) -> str:
                     dup_text = text
             gt.append((text, should_dedup, dup_text))
 
-        keys_list = ["no_dedupe", {}, {"size"}, {"fontname"}, {"size", "fontname"}]
+        keys_list = ["no_dedupe", (), ("size",), ("fontname",), ("size", "fontname")]
         for keys in keys_list:
             if keys != "no_dedupe":
-                filtered_page = page.dedupe_chars(
-                    tolerance=2, ignore_char_properties=keys
-                )
+                filtered_page = page.dedupe_chars(tolerance=2, extra_attrs=keys)
             else:
                 filtered_page = page
             for i, line in enumerate(
@@ -116,13 +114,17 @@ def dup_chars(s: str) -> str:
                     should_dedup = False
                 if isinstance(should_dedup, str):
                     if should_dedup in keys:
-                        assert (
-                            line == text
-                        ), f"Should not be duplicated when ignoring {should_dedup}"
+                        fail_msg = (
+                            f"{should_dedup} is not required to match "
+                            "so it should be duplicated"
+                        )
+                        assert line == dup_text, fail_msg
                     else:
-                        assert (
-                            line == dup_text
-                        ), f"{should_dedup} is not ignored so it should be duplicated"
+                        fail_msg = (
+                            "Should not be duplicated "
+                            f"when requiring matching {should_dedup}"
+                        )
+                        assert line == text, fail_msg
                 elif should_dedup:
                     assert line == text
                 else: