facebook · Dcallies · Nov 27, 2024 · Nov 4, 2024 · Nov 13, 2024 · Nov 13, 2024
@@ -53,6 +53,7 @@ def init_argparse(cls, settings: CLISettings, ap: argparse.ArgumentParser) -> No
         signal_choices = sorted(
             s.get_name() for s in signal_types if issubclass(s, FileHasher)
         )
+
         ap.add_argument(
             "content_type",
             **common.argparse_choices_pre_type_kwargs(
@@ -80,25 +81,50 @@ def init_argparse(cls, settings: CLISettings, ap: argparse.ArgumentParser) -> No
         )
 
         ap.add_argument(
-            "--rotations",
-            "--R",
+            "--photo-preprocess",
+            choices=["unletterbox", "rotations"],
+            help=(
+                "Apply one of the preprocessing steps to the image before hashing. "
+                "'unletterbox' removes black borders, and 'rotations' generates all 8 "
+                "simple rotations."
+            ),
+        )
+
+        ap.add_argument(
+            "--black-threshold",
+            type=int,
+            default=10,
+            # %(default)s makes argparse print the real default, so the help
+            # text cannot drift from the value configured above.
+            help=(
+                "Set the black threshold for unletterboxing "
+                "(default: %(default)s). "
+                "Only applies when 'unletterbox' is selected in --photo-preprocess."
+            ),
+        )
+
+        ap.add_argument(
+            "--save-output",
             action="store_true",
-            help="for photos, generate all 8 simple rotations",
+            help="If true, saves the processed image as a new file.",
         )
 
     def __init__(
         self,
         content_type: t.Type[ContentType],
         signal_type: t.Optional[t.Type[SignalType]],
         files: t.List[pathlib.Path],
-        rotations: bool = False,
+        photo_preprocess: t.Optional[str] = None,
+        black_threshold: int = 0,
+        save_output: bool = False,
     ) -> None:
+        """
+        Store the arguments declared in init_argparse and validate them.
+
+        photo_preprocess: optional preprocessing step; the CLI offers
+            "unletterbox" and "rotations".
+        black_threshold: threshold handed to PhotoContent.unletterbox.
+            NOTE(review): defaults to 0 here but --black-threshold defaults
+            to 10 on the command line - confirm which is intended.
+        save_output: when true, execute() also writes each processed image
+            to a new file next to the input.
+
+        Raises CommandError (code 2) when photo_preprocess is set but
+        content_type is not a PhotoContent subclass.
+        """
         self.content_type = content_type
         self.signal_type = signal_type
-
+        self.photo_preprocess = photo_preprocess
+        self.black_threshold = black_threshold
+        self.save_output = save_output
         self.files = files
-
-        self.rotations = rotations
+        # Fail at construction time so a bad flag combination is reported
+        # before any file is processed.
+        if self.photo_preprocess and not issubclass(self.content_type, PhotoContent):
+            raise CommandError(
+                "--photo-preprocess flag is only available for Photo content type", 2
+            )
 
     def execute(self, settings: CLISettings) -> None:
         hashers = [
@@ -115,28 +141,44 @@ def execute(self, settings: CLISettings) -> None:
 
             hashers = [self.signal_type]  # type: ignore  # can't detect intersection types
 
-        if not self.rotations:
+        if self.photo_preprocess:
             for file in self.files:
-                for hasher in hashers:
-                    hash_str = hasher.hash_from_file(file)
-                    if hash_str:
-                        print(hasher.get_name(), hash_str)
-            return
-
-        if not issubclass(self.content_type, PhotoContent):
-            raise CommandError(
-                "--rotations flag is only available for Photo content type", 2
-            )
-
-        for file in self.files:
-            with open(file, "rb") as f:
-                image_bytes = f.read()
-                rotated_images = PhotoContent.all_simple_rotations(image_bytes)
-                for rotation_type, rotated_bytes in rotated_images.items():
-                    with tempfile.NamedTemporaryFile() as temp_file:  # Create a temporary file to hold the byte data
-                        temp_file.write(rotated_bytes)
+                updated_bytes: t.List[bytes] = []
+                rotation_type = []
+                if self.photo_preprocess == "unletterbox":
+                    updated_bytes.append(
+                        PhotoContent.unletterbox(str(file), self.black_threshold)
+                    )
+                elif self.photo_preprocess == "rotations":
+                    with open(file, "rb") as f:
+                        image_bytes = f.read()
+                        rotations = PhotoContent.all_simple_rotations(image_bytes)
+                        rotation_type, updated_bytes = list(rotations.keys()), list(
+                            rotations.values()
+                        )
+                for idx, bytes_data in enumerate(updated_bytes):
+                    with tempfile.NamedTemporaryFile() as temp_file:
+                        temp_file.write(bytes_data)
+                        # hash_from_file only receives the path, so flush the
+                        # buffered bytes to disk first; otherwise a small
+                        # image can be read back short or empty.
+                        temp_file.flush()
                         temp_file_path = pathlib.Path(temp_file.name)
                         for hasher in hashers:
                             hash_str = hasher.hash_from_file(temp_file_path)
                             if hash_str:
-                                print(rotation_type.name, hasher.get_name(), hash_str)
+                                # Emit the rotation label only when there is
+                                # one, so unletterbox output does not start
+                                # with a stray space.
+                                label = (
+                                    f"{rotation_type[idx].name} "
+                                    if rotation_type
+                                    else ""
+                                )
+                                print(f"{label}{hasher.get_name()} {hash_str}")
+                    if self.save_output:
+                        suffix = (
+                            f"_{rotation_type[idx].name}"
+                            if rotation_type
+                            else "_unletterboxed"
+                        )
+                        output_path = file.with_stem(f"{file.stem}{suffix}")
+                        with open(output_path, "wb") as output_file:
+                            output_file.write(bytes_data)
+                        print(f"Processed image saved to: {output_path}")
+        else:
+            for file in self.files:
+                for hasher in hashers:
+                    hash_str = hasher.hash_from_file(file)
+                    if hash_str:
+                        print(hasher.get_name(), hash_str)
@@ -83,8 +83,8 @@ def test_rotations_with_non_photo_content(
     """Test that rotation flag raises error with non-photo content"""
     for content_type in ["url", "text", "video"]:
         hash_cli.assert_cli_usage_error(
-            ("--rotations", content_type, str(tmp_file)),
-            msg_regex="--rotations flag is only available for Photo content type",
+            ("--photo-preprocess=rotations", content_type, str(tmp_file)),
+            msg_regex="--photo-preprocess flag is only available for Photo content type",
         )
 
 
@@ -93,7 +93,7 @@ def test_rotations_with_photo_content(hash_cli: ThreatExchangeCLIE2eHelper):
     test_file = pathlib.Path("threatexchange/tests/hashing/resources/LA.png")
 
     hash_cli.assert_cli_output(
-        ("--rotations", "photo", str(test_file)),
+        ("--photo-preprocess=rotations", "photo", str(test_file)),
         [
             "ORIGINAL pdq accb6d39648035f8125c8ce6ba65007de7b54c67a2d93ef7b8f33b0611306715",
             "ROTATE90 pdq 1f70cbbc77edc5f9524faa1b18f3b76cd0a04a833e20f645d229d0acc8499c56",
@@ -105,3 +105,49 @@ def test_rotations_with_photo_content(hash_cli: ThreatExchangeCLIE2eHelper):
             "FLIPMINUS1 pdq 5bb15db9e8a1f03c174a380a55aeaa2985bde9c60abce301bde48df918b5c15b",
         ],
     )
+
+
+def test_unletterbox_with_non_photo_content(
+    hash_cli: ThreatExchangeCLIE2eHelper, tmp_file: pathlib.Path
+):
+    """Unletterbox preprocessing must be rejected for every non-photo content type"""
+    expected_error = "--photo-preprocess flag is only available for Photo content type"
+    target = str(tmp_file)
+    for non_photo_type in ("url", "text", "video"):
+        hash_cli.assert_cli_usage_error(
+            ("--photo-preprocess=unletterbox", non_photo_type, target),
+            msg_regex=expected_error,
+        )
+
+
+def test_unletterbox_with_photo_content(hash_cli: ThreatExchangeCLIE2eHelper):
+    """Test that photo unletterboxing is properly processed"""
+    test_file = pathlib.Path(
+        "threatexchange/tests/hashing/resources/letterboxed_sample-b.jpg"
+    )
+    clean_file = pathlib.Path("threatexchange/tests/hashing/resources/sample-b.jpg")
+
+    # Baseline: hash of sample-b.jpg with no preprocessing.
+    hash_cli.assert_cli_output(
+        ("photo", str(clean_file)),
+        [
+            "pdq f8f8f0cee0f4a84f06370a22038f63f0b36e2ed596621e1d33e6b39c4e9c9b22",
+        ],
+    )
+
+    # Unletterboxing output changes with the allowed black threshold:
+    # with the default threshold the hash still differs from the baseline...
+    hash_cli.assert_cli_output(
+        ("--photo-preprocess=unletterbox", "photo", str(test_file)),
+        [
+            "pdq 58f870cce0f4e84d8e378a32028f63f4b36e26f597621e1d33e6b39c4a9c9b22",
+        ],
+    )
+
+    # ...while --black-threshold=25 reproduces the baseline hash exactly.
+    hash_cli.assert_cli_output(
+        (
+            "--photo-preprocess=unletterbox",
+            "--black-threshold=25",
+            "photo",
+            str(test_file),
+        ),
+        [
+            "pdq f8f8f0cee0f4a84f06370a22038f63f0b36e2ed596621e1d33e6b39c4e9c9b22",
+        ],
+    )
@@ -5,10 +5,12 @@
-Wrapper around the video content type.
+Wrapper around the photo content type.
 """
 from PIL import Image
+from pathlib import Path
 import io
 import typing as t
 
 from .content_base import ContentType, RotationType
+from threatexchange.content_type.preprocess import unletterboxing
 
 
 class PhotoContent(ContentType):
@@ -102,3 +104,22 @@ def all_simple_rotations(cls, image_data: bytes) -> t.Dict[RotationType, bytes]:
             RotationType.FLIPMINUS1: cls.flip_minus1(image_data),
         }
         return rotations
+
+    @classmethod
+    def unletterbox(cls, file_path: str, black_threshold: int = 0) -> bytes:
+        """
+        Remove black letterbox borders from all four edges of an image.
+
+        file_path: path of the image to open with PIL.
+        black_threshold: forwarded to the unletterboxing border detectors.
+
+        Returns the resulting image as raw bytes, encoded in the source
+        file's format. If the detected borders cover the whole image (for
+        example a fully black frame), the image is returned uncropped
+        instead of being cropped with an inverted box.
+        """
+        with Image.open(file_path) as image:
+            top = unletterboxing.detect_top_border(image, black_threshold)
+            bottom = unletterboxing.detect_bottom_border(image, black_threshold)
+            left = unletterboxing.detect_left_border(image, black_threshold)
+            right = unletterboxing.detect_right_border(image, black_threshold)
+
+            width, height = image.size
+            # When every row/column counts as border the opposing borders
+            # overlap and the crop box would be inverted; keep the whole frame.
+            if left + right >= width or top + bottom >= height:
+                result = image
+            else:
+                result = image.crop((left, top, width - right, height - bottom))
+
+            with io.BytesIO() as buffer:
+                # Encode using the format of the file that was opened.
+                result.save(buffer, format=image.format)
+                return buffer.getvalue()
@@ -0,0 +1,69 @@
+from PIL import Image
+
+
+def is_pixel_black(pixel, threshold):
+    """
+    Check if each color channel in the pixel is below the threshold.
+
+    pixel: an (r, g, b) tuple. An (r, g, b, a) tuple or a bare number
+        (single-band image) is accepted as well, so non-RGB pixels no
+        longer fail tuple unpacking.
+    threshold: exclusive upper bound; a channel equal to it is not black,
+        so a threshold of 0 matches nothing for non-negative channel values.
+    """
+    if isinstance(pixel, (int, float)):
+        return pixel < threshold
+    # Only the first three bands are compared.
+    # NOTE(review): assumes a 4-band pixel is RGBA (alpha ignored); CMYK
+    # would need its own rule - confirm which modes must be supported.
+    return all(channel < threshold for channel in pixel[:3])
+
+
+def detect_top_border(image: Image.Image, black_threshold: int = 0) -> int:
+    """
+    Measure the black border at the top of the image.
+
+    Walks the rows from the top and returns the index of the first row that
+    contains a pixel is_pixel_black rejects, which equals the number of
+    all-black rows above it. Returns the image height if every row is black.
+    """
+    width, height = image.size
+    for row in range(height):
+        strip = image.crop((0, row, width, row + 1))
+        if not all(is_pixel_black(px, black_threshold) for px in strip.getdata()):
+            return row
+    return height
+
+
+def detect_bottom_border(image: Image.Image, black_threshold: int = 0) -> int:
+    """
+    Measure the black border at the bottom of the image.
+
+    Walks the rows from the bottom upwards and returns how many all-black
+    rows lie below the first row containing a pixel is_pixel_black rejects.
+    Returns the image height if every row is black.
+    """
+    width, height = image.size
+    for row in reversed(range(height)):
+        strip = image.crop((0, row, width, row + 1))
+        if not all(is_pixel_black(px, black_threshold) for px in strip.getdata()):
+            return height - 1 - row
+    return height
+
+
+def detect_left_border(image: Image.Image, black_threshold: int = 0) -> int:
+    """
+    Measure the black border on the left side of the image.
+
+    Walks the columns from the left and returns the index of the first
+    column that contains a pixel is_pixel_black rejects, which equals the
+    number of all-black columns before it. Returns the image width if every
+    column is black.
+    """
+    width, height = image.size
+    for column in range(width):
+        strip = image.crop((column, 0, column + 1, height))
+        if not all(is_pixel_black(px, black_threshold) for px in strip.getdata()):
+            return column
+    return width
+
+
+def detect_right_border(image: Image.Image, black_threshold: int = 0) -> int:
+    """
+    Measure the black border on the right side of the image.
+
+    Walks the columns from the right and returns how many all-black columns
+    lie beyond the first column containing a pixel is_pixel_black rejects.
+    Returns the image width if every column is black.
+    """
+    width, height = image.size
+    for column in reversed(range(width)):
+        strip = image.crop((column, 0, column + 1, height))
+        if not all(is_pixel_black(px, black_threshold) for px in strip.getdata()):
+            return width - 1 - column
+    return width