Merge pull request #66 from Filimoa/parse-images-pdf-miner

Parse images pdf miner
Filimoa · Sep 24, 2024 · 7971f05
2 parents f5bd782 + b73f6bb
commit 7971f05
Show file tree

Hide file tree

Showing 7 changed files with 185 additions and 17 deletions.
diff --git a/.gitignore b/.gitignore
@@ -37,4 +37,5 @@ notebooks/
 sample-docs/
 weights/
 .env
+uv.lock
 
diff --git a/src/openparse/doc_parser.py b/src/openparse/doc_parser.py
@@ -11,6 +11,8 @@
 )
 from openparse.schemas import Node, ParsedDocument, TableElement, TextElement
 
+from openparse.schemas import ImageElement
+
 IngestionPipelineType = TypeVar("IngestionPipelineType", bound=IngestionPipeline)
 
 
@@ -125,7 +127,7 @@ def parse(
 
     @staticmethod
     def _elems_to_nodes(
-        elems: Union[List[TextElement], List[TableElement]],
+        elems: Union[List[TextElement], List[TableElement], List[ImageElement]],
     ) -> List[Node]:
         return [
             Node(

diff --git a/src/openparse/schemas.py b/src/openparse/schemas.py
@@ -15,7 +15,6 @@
     r"^(\s*[\-•](?!\*)|\s*\*(?!\*)|\s*\d+\.\s|\s*\([a-zA-Z0-9]+\)\s|\s*[a-zA-Z]\.\s)"
 )
 
-
 ReadingOrder = namedtuple("ReadingOrder", "min_page y_position min_x0")
 
 
@@ -260,7 +259,9 @@ def area(self) -> float:
         return (self.bbox.x1 - self.bbox.x0) * (self.bbox.y1 - self.bbox.y0)
 
     def is_at_similar_height(
-        self, other: Union["TableElement", "TextElement"], error_margin: float = 1
+        self,
+        other: Union["TableElement", "TextElement", "ImageElement"],
+        error_margin: float = 1,
     ) -> bool:
         y_distance = abs(self.bbox.y1 - other.bbox.y1)
 
@@ -320,7 +321,64 @@ def tokens(self) -> int:
         return num_tokens(self.text)
 
     def is_at_similar_height(
-        self, other: Union["TableElement", "TextElement"], error_margin: float = 1
+        self,
+        other: Union["TableElement", "TextElement", "ImageElement"],
+        error_margin: float = 1,
+    ) -> bool:
+        y_distance = abs(self.bbox.y1 - other.bbox.y1)
+
+        return y_distance <= error_margin
+
+
+######################
+### IMAGE ELEMENTS ###
+######################
+
+
+class ImageElement(BaseModel):
+    text: str
+    bbox: Bbox
+    image: str  # base64 encoded image
+    image_mimetype: Union[
+        Literal[
+            "image/jpeg",
+            "image/png",
+            "image/bmp",
+            "image/jbig2",
+            "image/webp",
+            "unknown",
+        ],
+        str,
+    ]
+    _embed_text: Optional[str] = None
+    variant: Literal[NodeVariant.IMAGE] = NodeVariant.IMAGE
+
+    model_config = ConfigDict(frozen=True)
+
+    @computed_field  # type: ignore
+    @cached_property
+    def embed_text(self) -> str:
+        if self._embed_text:
+            return self._embed_text
+
+        return self.text
+
+    @cached_property
+    def area(self) -> float:
+        return (self.bbox.x1 - self.bbox.x0) * (self.bbox.y1 - self.bbox.y0)
+
+    @cached_property
+    def page(self) -> int:
+        return self.bbox.page
+
+    @cached_property
+    def tokens(self) -> int:
+        return 512  # Placeholder for image tokenization
+
+    def is_at_similar_height(
+        self,
+        other: Union["TableElement", "TextElement", "ImageElement"],
+        error_margin: float = 1,
     ) -> bool:
         y_distance = abs(self.bbox.y1 - other.bbox.y1)
 
@@ -362,7 +420,7 @@ class Node(BaseModel):
         description="Unique ID of the node.",
         exclude=True,
     )
-    elements: Tuple[Union[TextElement, TableElement], ...] = Field(
+    elements: Tuple[Union[TextElement, TableElement, ImageElement], ...] = Field(
         exclude=True, frozen=True
     )
     tokenization_lower_limit: int = Field(
@@ -385,14 +443,19 @@ def node_id(self) -> str:
 
     @computed_field  # type: ignore
     @cached_property
-    def variant(self) -> Set[Literal["text", "table"]]:
+    def variant(self) -> Set[Literal["text", "table", "image"]]:
         return {e.variant.value for e in self.elements}
 
     @computed_field  # type: ignore
     @cached_property
     def tokens(self) -> int:
         return sum([e.tokens for e in self.elements])
 
+    @computed_field  # type: ignore
+    @cached_property
+    def images(self) -> List[ImageElement]:
+        # Collect, in their original order, the elements whose variant marks
+        # them as images. Kept as comments rather than a docstring so the
+        # computed field's generated schema is left untouched.
+        found = []
+        for elem in self.elements:
+            if elem.variant == NodeVariant.IMAGE:
+                found.append(elem)
+        return found
+
     @computed_field  # type: ignore
     @cached_property
     def bbox(self) -> List[Bbox]:

diff --git a/src/openparse/text/pdfminer/core.py b/src/openparse/text/pdfminer/core.py
@@ -1,10 +1,19 @@
-from typing import Any, Iterable, List, Tuple, Union
-
-from pdfminer.layout import LTAnno, LTChar, LTTextContainer, LTTextLine
+import base64
+from io import BytesIO
+from typing import Any, Iterable, List, Optional, Tuple, Union
+
+from pdfminer.layout import (
+    LTAnno,
+    LTChar,
+    LTFigure,
+    LTImage,
+    LTTextContainer,
+    LTTextLine,
+)
 from pydantic import BaseModel, model_validator
 
 from openparse.pdf import Pdf
-from openparse.schemas import Bbox, LineElement, TextElement, TextSpan
+from openparse.schemas import Bbox, ImageElement, LineElement, TextElement, TextSpan
 
 
 class CharElement(BaseModel):
@@ -54,6 +63,24 @@ def _extract_chars(text_line: LTTextLine) -> List[CharElement]:
     return chars
 
 
+def _literal_name(value: Any) -> Optional[str]:
+    """Return the ``name`` attribute of a stream attribute value, if any.
+
+    ``None`` is returned for a missing value or one without a ``name``.
+    When the value is a list/tuple (a stream may declare several filters),
+    the last entry is used: filters are applied in order when decoding, so
+    the final one describes the encoding of the remaining image payload.
+    """
+    if isinstance(value, (list, tuple)):
+        value = value[-1] if value else None
+    return getattr(value, "name", None)
+
+
+def get_mime_type(pdf_object: "LTImage") -> Optional[str]:
+    """Guess the MIME type of an embedded image from its stream attributes.
+
+    Args:
+        pdf_object: Image layout object whose ``stream.attrs`` mapping is
+            inspected for the ``Subtype`` and ``Filter`` entries.
+
+    Returns:
+        The MIME type implied by the stream's filter, or ``None`` when the
+        stream is not an image, has no filter, or uses an unrecognised one.
+    """
+    attrs = pdf_object.stream.attrs
+
+    # Missing keys must yield None rather than raising: the previous dict
+    # fallback had no ``.name`` attribute and crashed with AttributeError.
+    if _literal_name(attrs.get("Subtype")) != "Image":
+        return None
+
+    filter_to_mime = {
+        "DCTDecode": "image/jpeg",
+        "FlateDecode": "image/png",  # Most likely, but could also be TIFF
+        "JPXDecode": "image/jp2",
+        "CCITTFaxDecode": "image/tiff",
+        "JBIG2Decode": "image/jbig2",
+    }
+    return filter_to_mime.get(_literal_name(attrs.get("Filter")))
+
+
 def _group_chars_into_spans(chars: Iterable[CharElement]) -> List[TextSpan]:
     spans = []
     current_text = ""
@@ -115,8 +142,8 @@ def _get_bbox(lines: List[LineElement]) -> Tuple[float, float, float, float]:
     return x0, y0, x1, y1
 
 
-def ingest(pdf_input: Union[Pdf]) -> List[TextElement]:
-    """Parse PDF and return a list of LineElement objects."""
+def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]:
+    """Parse PDF and return a list of TextElement and ImageElement objects."""
     elements = []
     page_layouts = pdf_input.extract_layout_pages()
 
@@ -148,5 +175,27 @@ def ingest(pdf_input: Union[Pdf]) -> List[TextElement]:
                         lines=tuple(lines),
                     )
                 )
-
+            elif isinstance(element, LTFigure):
+                for e in element._objs:
+                    if isinstance(e, LTImage):
+                        mime_type = get_mime_type(e)
+                        if mime_type:
+                            img_data = BytesIO(e.stream.get_data()).getvalue()
+                            base64_string = base64.b64encode(img_data).decode("utf-8")
+                            elements.append(
+                                ImageElement(
+                                    bbox=Bbox(
+                                        x0=e.bbox[0],
+                                        y0=e.bbox[1],
+                                        x1=e.bbox[2],
+                                        y1=e.bbox[3],
+                                        page=page_num,
+                                        page_width=page_width,
+                                        page_height=page_height,
+                                    ),
+                                    image=base64_string,
+                                    image_mimetype=mime_type or "unknown",
+                                    text="",
+                                )
+                            )
     return elements
diff --git a/src/tests/sample_data/europe.jpg b/src/tests/sample_data/europe.jpg
diff --git a/src/tests/sample_data/pdf-with-image.pdf b/src/tests/sample_data/pdf-with-image.pdf
diff --git a/src/tests/text/pdf_miner/test_core.py b/src/tests/text/pdf_miner/test_core.py
@@ -1,16 +1,21 @@
-from typing import Tuple, List
+import base64
+import io
+from pathlib import Path
+from typing import List, Tuple
 from unittest.mock import MagicMock
 
 from pdfminer.layout import LTAnno, LTChar
+from PIL import Image, ImageChops
 
-from openparse.schemas import TextSpan
+from openparse.pdf import Pdf
+from openparse.schemas import NodeVariant, TextSpan
 from openparse.text.pdfminer.core import (
     CharElement,
-    _group_chars_into_spans,
     _extract_chars,
+    _group_chars_into_spans,
+    ingest,
 )
 
-
 raw_chars = [
     CharElement(text="1", fontname="bold", size=9.0),
     CharElement(text=".", fontname="bold", size=9.0),
@@ -198,3 +203,51 @@ def test_extract_chars_with_ltannos():
 
     # Assert the result matches the expected output
     assert result == expected_output
+
+
+def _images_are_similar(img1_bytes, img2_bytes, max_pct_diff=1.0, pixel_threshold=10):
+    """
+    Decide whether two encoded images are close enough to count as the same picture.
+
+    Both inputs are decoded and converted to RGB. Their per-pixel difference is
+    reduced to grayscale, and a pixel counts as "different" when that grayscale
+    value exceeds ``pixel_threshold``.
+
+    :param img1_bytes: Encoded bytes of the first image.
+    :param img2_bytes: Encoded bytes of the second image.
+    :param max_pct_diff: Highest percentage of differing pixels still accepted.
+    :param pixel_threshold: Grayscale difference above which a pixel counts as different.
+    :return: False when the sizes differ; otherwise whether the differing-pixel
+        percentage is at most ``max_pct_diff``.
+    """
+    first, second = (
+        Image.open(io.BytesIO(raw)).convert("RGB") for raw in (img1_bytes, img2_bytes)
+    )
+
+    # Images of different dimensions cannot be compared pixel by pixel.
+    if first.size != second.size:
+        print(f"Image sizes do not match: {first.size} vs {second.size}")
+        return False
+
+    grayscale_delta = ImageChops.difference(first, second).convert("L")
+
+    width, height = first.size
+    changed = sum(value > pixel_threshold for value in grayscale_delta.getdata())
+    pct_diff = (changed / (width * height)) * 100
+
+    print(f"Percentage of differing pixels: {pct_diff:.2f}%")
+    return pct_diff <= max_pct_diff
+
+
+def test_parse_pdf_with_images():
+    """Ingest a PDF that embeds a JPEG and check the image is extracted intact."""
+    # Resolve fixtures relative to this test file (src/tests/text/pdf_miner/)
+    # so the test does not depend on the directory pytest is launched from.
+    sample_dir = Path(__file__).resolve().parents[2] / "sample_data"
+    pdf = Pdf(sample_dir / "pdf-with-image.pdf")
+
+    elems = ingest(pdf)
+    # Fail with a clear message instead of an IndexError on the lookup below.
+    assert elems, "ingest() returned no elements"
+
+    image_elem = elems[-1]
+    assert image_elem.variant == NodeVariant.IMAGE
+    assert image_elem.image_mimetype == "image/jpeg"
+    extracted_image_data = base64.b64decode(image_elem.image)
+
+    # Read the raw image data
+    raw_image_data = (sample_dir / "europe.jpg").read_bytes()
+
+    assert _images_are_similar(raw_image_data, extracted_image_data)
@@ -37,4 +37,5 @@ notebooks/
 sample-docs/
 weights/
 .env
+uv.lock