From b0493732c475899f60585ad441e9eb401c50a333 Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Sun, 16 Jul 2023 10:46:40 -0400 Subject: [PATCH] Replace Wand with pypdfium2 for page.to_image(...) This commit swaps out Wand (and its non-Python dependencies ImageMagick and Ghostscript) for pypdfium2 for PageImage rendering. This has some advantages: - Less finicky: Wand often caused users problems, due to "MagickWand shared library not found" and "PolicyError: not authorized `PDF'" issues. By contrast, pypdfium2 seems (at least at first) to be more self-contained and not require any system-tweaking. - Faster: pypdfium2 appears to render images more quickly than Wand (see @cmdlineuser's tests in #899) - More flexible: pypdfium2 appears to generate images with greater color depth. By default, pdfplumber quantizes those images so that they save/display compactly (in fact, with smaller file sizes than the previous code); this commit also adds parameters to retain all/more of the original, more detailed colors. Thanks to @cmdlineuser in #899 for the suggestion. --- README.md | 13 ++--- pdfplumber/display.py | 121 +++++++++++++++++++++++------------------- pdfplumber/pdf.py | 5 +- requirements.txt | 2 +- tests/test_display.py | 21 +++++--- 5 files changed, 87 insertions(+), 75 deletions(-) diff --git a/README.md b/README.md index dbf4cf43..4fcc47bb 100644 --- a/README.md +++ b/README.md @@ -250,17 +250,12 @@ If you pass the `pdfminer.six`-handling `laparams` parameter to `pdfplumber.open `pdfplumber`'s visual debugging tools can be helpful in understanding the structure of a PDF and the objects that have been extracted from it. -__Note:__ To use this feature, you'll also need to have two additional pieces of software installed on your computer: - -- [`ImageMagick`](https://www.imagemagick.org/). [Installation instructions here](http://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-debian). -- [`ghostscript`](https://www.ghostscript.com). 
[Installation instructions here](https://ghostscript.readthedocs.io/en/latest/Install.html), or simply `apt install ghostscript` (Ubuntu) / `brew install ghostscript` (Mac). - ### Creating a `PageImage` with `.to_image()` To turn any page (including cropped pages) into an `PageImage` object, call `my_page.to_image()`. You can optionally pass *one* of the following keyword arguments: -- `resolution`: The desired number pixels per inch. Defaults to 72. See note below. +- `resolution`: The desired number pixels per inch. Defaults to 72. - `width`: The desired image width in pixels. - `height`: The desired image width in pixels. @@ -270,12 +265,10 @@ For instance: im = my_pdf.pages[0].to_image(resolution=150) ``` -From a script or REPL, `im.show()` will open the image in your local image viewer. But `PageImage` objects also play nicely with IPython/Jupyter notebooks; they automatically render as cell outputs. For example: +From a script or REPL, `im.show()` will open the image in your local image viewer. But `PageImage` objects also play nicely with Jupyter notebooks; they automatically render as cell outputs. For example: ![Visual debugging in Jupyter](examples/screenshots/visual-debugging-in-jupyter.png "Visual debugging in Jupyter") -*Note*: `pdfplumber` passes the `resolution` parameter to [Wand](https://docs.wand-py.org/en/latest/wand/image.html#wand.image.Image), the Python library we use for image conversion. Wand will create the image with the desired number of total pixels of height/width, but does not fully respect the `resolution` in the strict sense of that word: Although PNGs are capable of storing an image's resolution density as metadata, Wand's PNGs do not. - *Note*: `.to_image(...)` works as expected with `Page.crop(...)`/`CroppedPage` instances, but is unable to incorporate changes made via `Page.filter(...)`/`FilteredPage` instances. 
@@ -286,7 +279,7 @@ From a script or REPL, `im.show()` will open the image in your local image viewe |`im.reset()`| Clears anything you've drawn so far.| |`im.copy()`| Copies the image to a new `PageImage` object.| |`im.show()`| Opens the image in your local image viewer.| -|`im.save(path_or_fileobject, format="PNG")`| Saves the annotated image.| +|`im.save(path_or_fileobject, format="PNG", quantize=True, colors=256, bits=8)`| Saves the annotated image as a PNG file. The default arguments quantize the image to a palette of 256 colors, saving the PNG with 8-bit color depth. You can disable quantization by passing `quantize=False` or adjust the size of the color palette by passing `colors=N`.| ### Drawing methods diff --git a/pdfplumber/display.py b/pdfplumber/display.py index 2284ce62..b5cd9acf 100644 --- a/pdfplumber/display.py +++ b/pdfplumber/display.py @@ -1,10 +1,10 @@ from io import BufferedReader, BytesIO -from typing import TYPE_CHECKING, List, Optional, Tuple, Union +from pathlib import Path +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union import PIL.Image import PIL.ImageDraw -from wand.image import Color as WandColor # type: ignore -from wand.image import Image as WandImage +import pypdfium2 # type: ignore from . 
import utils from ._typing import T_bbox, T_num, T_obj, T_obj_list, T_point, T_seq @@ -34,66 +34,53 @@ class COLORS: def get_page_image( - stream: Union[BufferedReader, BytesIO], page_no: int, resolution: Union[int, float] -) -> WandImage: + stream: Union[BufferedReader, BytesIO], + page_ix: int, + resolution: Union[int, float], + password: Optional[str], +) -> PIL.Image.Image: # If we are working with a file object saved to disk if hasattr(stream, "name"): - filename = f"{stream.name}[{page_no}]" - file = None - - def postprocess(img: WandImage) -> WandImage: - return img + src = stream.name # If we instead are working with a BytesIO stream else: stream.seek(0) - filename = None - file = stream - - def postprocess(img: WandImage) -> WandImage: - return WandImage(image=img.sequence[page_no]) - - with WandImage( - resolution=resolution, - filename=filename, - file=file, - colorspace="rgb", - format="pdf", - ) as img_init: - img = postprocess(img_init) - with WandImage( - width=img.width, - height=img.height, - background=WandColor("white"), - colorspace="rgb", - ) as bg: - bg.composite(img, 0, 0) - try: - im = PIL.Image.open(BytesIO(bg.make_blob("png"))) - except PIL.Image.DecompressionBombError: - raise PIL.Image.DecompressionBombError( - "Image conversion raised a DecompressionBombError. " - "PIL.Image.MAX_IMAGE_PIXELS is currently set to " - f"{PIL.Image.MAX_IMAGE_PIXELS}. " - "If you trust this PDF, you can try setting " - "PIL.Image.MAX_IMAGE_PIXELS to a higher value. " - "See https://github.com/jsvine/pdfplumber/issues/413" - "#issuecomment-1190650404 for more information." 
- ) - return im.convert("RGB") + src = stream + + img: PIL.Image.Image = pypdfium2.PdfDocument._process_page( + # Modifiable arguments + page_ix, + input_data=src, + password=password, + scale=resolution / 72, + no_smoothtext=True, + # Non-modifiable arguments + renderer=pypdfium2._helpers.page.PdfPage.render, + converter=pypdfium2.PdfBitmap.to_pil, + prefer_bgrx=True, + pass_info=False, + need_formenv=False, + mk_formconfig=None, + ) + + return img.convert("RGB") class PageImage: def __init__( self, page: "Page", - original: Optional[WandImage] = None, + original: Optional[PIL.Image.Image] = None, resolution: Union[int, float] = DEFAULT_RESOLUTION, ): self.page = page if original is None: self.original = get_page_image( - page.pdf.stream, page.page_number - 1, resolution + stream=page.pdf.stream, + page_ix=page.page_number - 1, + resolution=resolution, + password=page.pdf.password, ) else: self.original = original @@ -104,15 +91,18 @@ def __init__( else: self.root = page.root_page cropped = page.root_page.bbox != page.bbox + + self.resolution = resolution self.scale = self.original.size[0] / self.root.width + if cropped: cropbox = ( - (page.bbox[0] - page.root_page.bbox[0]) * self.scale, - (page.bbox[1] - page.root_page.bbox[1]) * self.scale, - (page.bbox[2] - page.root_page.bbox[0]) * self.scale, - (page.bbox[3] - page.root_page.bbox[1]) * self.scale, + int((page.bbox[0] - page.root_page.bbox[0]) * self.scale), + int((page.bbox[1] - page.root_page.bbox[1]) * self.scale), + int((page.bbox[2] - page.root_page.bbox[0]) * self.scale), + int((page.bbox[3] - page.root_page.bbox[1]) * self.scale), ) - self.original = self.original.crop(tuple(map(int, cropbox))) + self.original = self.original.crop(cropbox) self.reset() def _reproject_bbox(self, bbox: T_bbox) -> T_bbox: @@ -134,12 +124,35 @@ def _reproject(self, coord: T_point) -> T_point: return (_x0, _top) def reset(self) -> "PageImage": - self.annotated = PIL.Image.new(self.original.mode, self.original.size) + 
self.annotated = PIL.Image.new("RGB", self.original.size) self.annotated.paste(self.original) self.draw = PIL.ImageDraw.Draw(self.annotated, "RGBA") - self.save = self.annotated.save return self + def save( + self, + dest: Union[str, Path, BytesIO], + format: str = "PNG", + quantize: bool = True, + colors: int = 256, + bits: int = 8, + **kwargs: Any, + ) -> None: + if quantize: + out = self.annotated.quantize(colors, method=PIL.Image.FASTOCTREE).convert( + "P" + ) + else: + out = self.annotated + + out.save( + dest, + format=format, + bits=bits, + dpi=(self.resolution, self.resolution), + **kwargs, + ) + def copy(self) -> "PageImage": return self.__class__(self.page, self.original) @@ -358,7 +371,7 @@ def outline_chars( def _repr_png_(self) -> bytes: b = BytesIO() - self.annotated.save(b, "PNG") + self.save(b, "PNG") return b.getvalue() def show(self) -> None: # pragma: no cover diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index 3e152574..d4e95766 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -29,15 +29,16 @@ def __init__( stream_is_external: bool = False, pages: Optional[Union[List[int], Tuple[int]]] = None, laparams: Optional[Dict[str, Any]] = None, - password: str = "", + password: Optional[str] = None, strict_metadata: bool = False, ): self.stream = stream self.stream_is_external = stream_is_external self.pages_to_parse = pages self.laparams = None if laparams is None else LAParams(**laparams) + self.password = password - self.doc = PDFDocument(PDFParser(stream), password=password) + self.doc = PDFDocument(PDFParser(stream), password=password or "") self.rsrcmgr = PDFResourceManager() self.metadata = {} diff --git a/requirements.txt b/requirements.txt index d8cd3239..45f7c2b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ pdfminer.six==20221105 Pillow>=9.1 -Wand>=0.6.10 +pypdfium2>=4.18.0 diff --git a/tests/test_display.py b/tests/test_display.py index 9316015a..5c17261d 100644 --- a/tests/test_display.py +++ 
b/tests/test_display.py @@ -90,16 +90,21 @@ def test_outline_chars(self): def test__repr_png_(self): png = self.im._repr_png_() assert isinstance(png, bytes) - assert len(png) in ( - 61247, - 71939, - 71983, - 72168, - ) # PNG encoder seems to work differently on different setups + assert 40000 < len(png) < 80000 + + def test_no_quantize(self): + b = io.BytesIO() + self.im.save(b, "PNG", quantize=False) + assert len(b.getvalue()) > 100000 def test_decompression_bomb(self): original_max = PIL.Image.MAX_IMAGE_PIXELS PIL.Image.MAX_IMAGE_PIXELS = 10 - with pytest.raises(PIL.Image.DecompressionBombError): - self.pdf.pages[0].to_image() + # Previously, this raised PIL.Image.DecompressionBombError + self.pdf.pages[0].to_image() PIL.Image.MAX_IMAGE_PIXELS = original_max + + def test_password(self): + path = os.path.join(HERE, "pdfs/password-example.pdf") + with pdfplumber.open(path, password="test") as pdf: + pdf.pages[0].to_image()