Skip to content

Commit

Permalink
Merge pull request #66 from Filimoa/parse-images-pdf-miner
Browse files Browse the repository at this point in the history
Parse images pdf miner
  • Loading branch information
Filimoa authored Sep 24, 2024
2 parents f5bd782 + b73f6bb commit 7971f05
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 17 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,5 @@ notebooks/
sample-docs/
weights/
.env
uv.lock

4 changes: 3 additions & 1 deletion src/openparse/doc_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
)
from openparse.schemas import Node, ParsedDocument, TableElement, TextElement

from openparse.schemas import ImageElement

IngestionPipelineType = TypeVar("IngestionPipelineType", bound=IngestionPipeline)


Expand Down Expand Up @@ -125,7 +127,7 @@ def parse(

@staticmethod
def _elems_to_nodes(
elems: Union[List[TextElement], List[TableElement]],
elems: Union[List[TextElement], List[TableElement], List[ImageElement]],
) -> List[Node]:
return [
Node(
Expand Down
73 changes: 68 additions & 5 deletions src/openparse/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
r"^(\s*[\-•](?!\*)|\s*\*(?!\*)|\s*\d+\.\s|\s*\([a-zA-Z0-9]+\)\s|\s*[a-zA-Z]\.\s)"
)


ReadingOrder = namedtuple("ReadingOrder", "min_page y_position min_x0")


Expand Down Expand Up @@ -260,7 +259,9 @@ def area(self) -> float:
return (self.bbox.x1 - self.bbox.x0) * (self.bbox.y1 - self.bbox.y0)

def is_at_similar_height(
self, other: Union["TableElement", "TextElement"], error_margin: float = 1
self,
other: Union["TableElement", "TextElement", "ImageElement"],
error_margin: float = 1,
) -> bool:
y_distance = abs(self.bbox.y1 - other.bbox.y1)

Expand Down Expand Up @@ -320,7 +321,64 @@ def tokens(self) -> int:
return num_tokens(self.text)

def is_at_similar_height(
self, other: Union["TableElement", "TextElement"], error_margin: float = 1
self,
other: Union["TableElement", "TextElement", "ImageElement"],
error_margin: float = 1,
) -> bool:
y_distance = abs(self.bbox.y1 - other.bbox.y1)

return y_distance <= error_margin


######################
### IMAGE ELEMENTS ###
######################


class ImageElement(BaseModel):
text: str
bbox: Bbox
image: str # base64 encoded image
image_mimetype: Union[
Literal[
"image/jpeg",
"image/png",
"image/bmp",
"image/jbig2",
"image/webp",
"unknown",
],
str,
]
_embed_text: Optional[str] = None
variant: Literal[NodeVariant.IMAGE] = NodeVariant.IMAGE

model_config = ConfigDict(frozen=True)

@computed_field  # type: ignore
@cached_property
def embed_text(self) -> str:
    """Return the text to embed for this image.

    The private ``_embed_text`` override wins when it is set to a
    non-empty value; otherwise the element's ``text`` is used.
    """
    return self._embed_text or self.text

@cached_property
def area(self) -> float:
    """Return the area of the bounding box (width times height)."""
    width = self.bbox.x1 - self.bbox.x0
    height = self.bbox.y1 - self.bbox.y0
    return width * height

@cached_property
def page(self) -> int:
    """Return the page recorded on this element's bounding box."""
    bounding_box = self.bbox
    return bounding_box.page

@cached_property
def tokens(self) -> int:
    """Return a fixed token count for the image.

    The value does not depend on the image content; it is a placeholder
    until image tokenization is implemented.
    """
    placeholder_token_count = 512  # Placeholder for image tokenization
    return placeholder_token_count

def is_at_similar_height(
self,
other: Union["TableElement", "TextElement", "ImageElement"],
error_margin: float = 1,
) -> bool:
y_distance = abs(self.bbox.y1 - other.bbox.y1)

Expand Down Expand Up @@ -362,7 +420,7 @@ class Node(BaseModel):
description="Unique ID of the node.",
exclude=True,
)
elements: Tuple[Union[TextElement, TableElement], ...] = Field(
elements: Tuple[Union[TextElement, TableElement, ImageElement], ...] = Field(
exclude=True, frozen=True
)
tokenization_lower_limit: int = Field(
Expand All @@ -385,14 +443,19 @@ def node_id(self) -> str:

@computed_field # type: ignore
@cached_property
def variant(self) -> Set[Literal["text", "table"]]:
def variant(self) -> Set[Literal["text", "table", "image"]]:
return {e.variant.value for e in self.elements}

@computed_field  # type: ignore
@cached_property
def tokens(self) -> int:
    """Return the combined token count of every element in this node."""
    total = 0
    for element in self.elements:
        total += element.tokens
    return total

@computed_field  # type: ignore
@cached_property
def images(self) -> List[ImageElement]:
    """Return the node's elements whose variant is ``NodeVariant.IMAGE``, in order."""
    image_elements = []
    for element in self.elements:
        if element.variant == NodeVariant.IMAGE:
            image_elements.append(element)
    return image_elements

@computed_field # type: ignore
@cached_property
def bbox(self) -> List[Bbox]:
Expand Down
63 changes: 56 additions & 7 deletions src/openparse/text/pdfminer/core.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
from typing import Any, Iterable, List, Tuple, Union

from pdfminer.layout import LTAnno, LTChar, LTTextContainer, LTTextLine
import base64
from io import BytesIO
from typing import Any, Iterable, List, Optional, Tuple, Union

from pdfminer.layout import (
LTAnno,
LTChar,
LTFigure,
LTImage,
LTTextContainer,
LTTextLine,
)
from pydantic import BaseModel, model_validator

from openparse.pdf import Pdf
from openparse.schemas import Bbox, LineElement, TextElement, TextSpan
from openparse.schemas import Bbox, ImageElement, LineElement, TextElement, TextSpan


class CharElement(BaseModel):
Expand Down Expand Up @@ -54,6 +63,24 @@ def _extract_chars(text_line: LTTextLine) -> List[CharElement]:
return chars


# Maps the name of a PDF stream filter to the MIME type reported for the image.
_FILTER_MIME_TYPES = {
    "DCTDecode": "image/jpeg",
    "FlateDecode": "image/png",  # Most likely, but could also be TIFF
    "JPXDecode": "image/jp2",
    "CCITTFaxDecode": "image/tiff",
    "JBIG2Decode": "image/jbig2",
}


def _literal_name(value: Any) -> Optional[str]:
    """Return the ``name`` of a PDF literal, or ``None`` if it has none.

    A list/tuple (a filter chain) is reduced to its last entry first; an
    empty chain yields ``None``.
    """
    if isinstance(value, (list, tuple)):
        if not value:
            return None
        value = value[-1]
    return getattr(value, "name", None)


def get_mime_type(pdf_object: "LTImage") -> Optional[str]:
    """Guess the MIME type of an embedded PDF image from its stream attributes.

    Args:
        pdf_object: Image layout object whose ``stream.attrs`` mapping holds
            the ``Subtype`` and ``Filter`` entries of the image stream.

    Returns:
        The MIME type associated with the stream's filter, or ``None`` when
        the stream is not an image, has no filter, or uses a filter that is
        not recognised.
    """
    attrs = pdf_object.stream.attrs

    # A missing key must produce None rather than raise: the previous
    # ``attrs.get(key, {"name": None}).name`` default was a dict, which has no
    # ``.name`` attribute, so any stream lacking Subtype/Filter crashed.
    if _literal_name(attrs.get("Subtype")) != "Image":
        return None

    # ``Filter`` may be a single name or an array of names; the last filter
    # in the chain is the one that identifies the image encoding.
    # NOTE(review): FlateDecode data is presumably deflated raw samples rather
    # than a complete PNG file - confirm consumers can handle that.
    return _FILTER_MIME_TYPES.get(_literal_name(attrs.get("Filter")))


def _group_chars_into_spans(chars: Iterable[CharElement]) -> List[TextSpan]:
spans = []
current_text = ""
Expand Down Expand Up @@ -115,8 +142,8 @@ def _get_bbox(lines: List[LineElement]) -> Tuple[float, float, float, float]:
return x0, y0, x1, y1


def ingest(pdf_input: Union[Pdf]) -> List[TextElement]:
"""Parse PDF and return a list of LineElement objects."""
def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]:
"""Parse PDF and return a list of TextElement and ImageElement objects."""
elements = []
page_layouts = pdf_input.extract_layout_pages()

Expand Down Expand Up @@ -148,5 +175,27 @@ def ingest(pdf_input: Union[Pdf]) -> List[TextElement]:
lines=tuple(lines),
)
)

elif isinstance(element, LTFigure):
for e in element._objs:
if isinstance(e, LTImage):
mime_type = get_mime_type(e)
if mime_type:
img_data = BytesIO(e.stream.get_data()).getvalue()
base64_string = base64.b64encode(img_data).decode("utf-8")
elements.append(
ImageElement(
bbox=Bbox(
x0=e.bbox[0],
y0=e.bbox[1],
x1=e.bbox[2],
y1=e.bbox[3],
page=page_num,
page_width=page_width,
page_height=page_height,
),
image=base64_string,
image_mimetype=mime_type or "unknown",
text="",
)
)
return elements
Binary file added src/tests/sample_data/europe.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added src/tests/sample_data/pdf-with-image.pdf
Binary file not shown.
61 changes: 57 additions & 4 deletions src/tests/text/pdf_miner/test_core.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
from typing import Tuple, List
import base64
import io
from pathlib import Path
from typing import List, Tuple
from unittest.mock import MagicMock

from pdfminer.layout import LTAnno, LTChar
from PIL import Image, ImageChops

from openparse.schemas import TextSpan
from openparse.pdf import Pdf
from openparse.schemas import NodeVariant, TextSpan
from openparse.text.pdfminer.core import (
CharElement,
_group_chars_into_spans,
_extract_chars,
_group_chars_into_spans,
ingest,
)


raw_chars = [
CharElement(text="1", fontname="bold", size=9.0),
CharElement(text=".", fontname="bold", size=9.0),
Expand Down Expand Up @@ -198,3 +203,51 @@ def test_extract_chars_with_ltannos():

# Assert the result matches the expected output
assert result == expected_output


def _images_are_similar(img1_bytes, img2_bytes, max_pct_diff=1.0, pixel_threshold=10):
    """
    Decide whether two encoded images differ in at most ``max_pct_diff`` percent of pixels.

    Both images are decoded and converted to RGB. Images of different
    dimensions are never considered similar.

    :param img1_bytes: Byte content of the first image.
    :param img2_bytes: Byte content of the second image.
    :param max_pct_diff: Maximum allowed percentage of differing pixels.
    :param pixel_threshold: Grayscale difference a pixel must exceed to count as different.
    :return: True when the percentage of differing pixels is within ``max_pct_diff``.
    """
    first, second = (
        Image.open(io.BytesIO(raw)).convert("RGB") for raw in (img1_bytes, img2_bytes)
    )

    if first.size != second.size:
        print(f"Image sizes do not match: {first.size} vs {second.size}")
        return False

    # Reduce the per-channel difference to a single grayscale value per pixel
    # so that one threshold can be applied.
    grayscale_delta = ImageChops.difference(first, second).convert("L")
    changed = len(
        [value for value in grayscale_delta.getdata() if value > pixel_threshold]
    )

    width, height = first.size
    pct_diff = (changed / (width * height)) * 100

    print(f"Percentage of differing pixels: {pct_diff:.2f}%")
    return pct_diff <= max_pct_diff


def test_parse_pdf_with_images():
    """The last element ingested from the sample PDF is a JPEG matching europe.jpg."""
    elems = ingest(Pdf(Path("src/tests/sample_data/pdf-with-image.pdf")))

    image_elem = elems[-1]
    assert image_elem.variant == NodeVariant.IMAGE
    assert image_elem.image_mimetype == "image/jpeg"
    extracted_bytes = base64.b64decode(image_elem.image)

    # Compare against the standalone copy of the image stored next to the PDF.
    expected_bytes = Path("src/tests/sample_data/europe.jpg").read_bytes()

    assert _images_are_similar(expected_bytes, extracted_bytes)

0 comments on commit 7971f05

Please sign in to comment.