diff --git a/pyproject.toml b/pyproject.toml
index d07028e..96019ec 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ description = "Streamlines the process of preparing documents for LLM's."
 readme = "README.md"
 requires-python = ">=3.8"
 license = { text = "MIT" }
-version = "0.5.7"
+version = "0.5.8"
 authors = [{name = "Sergey Filimonov", email = "hello@sergey.fyi"}]
 dependencies = [
     "PyMuPDF >= 1.23.2",
diff --git a/src/openparse/pdf.py b/src/openparse/pdf.py
index 5f4fd22..2dbc0d9 100644
--- a/src/openparse/pdf.py
+++ b/src/openparse/pdf.py
@@ -8,6 +8,7 @@
 
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTPage
+from PIL import Image
 from pydantic import BaseModel
 from pypdf import PdfReader, PdfWriter
 
@@ -237,3 +238,36 @@ def _flip_coordinates(self, bbox: Bbox) -> Bbox:
             x1=bbox.x1,
             y1=fy1,
         )
+
+    def to_imgs(self, page_numbers: Optional[List[int]] = None) -> List[Image.Image]:
+        """Render pages of the document to PIL images.
+
+        Args:
+            page_numbers: Zero-based page indices to render, in the given
+                order. ``None`` (the default) renders every page.
+
+        Returns:
+            One RGB image per requested page.
+
+        Raises:
+            ValueError: If the document is not a PDF or is password protected.
+        """
+        doc = self.to_pymupdf_doc()
+        images = []
+
+        if not doc.is_pdf:
+            raise ValueError("The document is not in PDF format.")
+        if doc.needs_pass:
+            raise ValueError("The PDF document is password protected.")
+
+        if page_numbers is None:
+            page_numbers = list(range(doc.page_count))
+
+        for n in page_numbers:
+            page = doc[n]
+            # Default pixmap is RGB without alpha, so samples match mode "RGB".
+            pix = page.get_pixmap()
+            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+            images.append(image)
+
+        return images
diff --git a/src/openparse/version.py b/src/openparse/version.py
index 5f6394d..0c3e4bb 100644
--- a/src/openparse/version.py
+++ b/src/openparse/version.py
@@ -1,4 +1,4 @@
-OPEN_PARSE_VERSION = "0.5.7"
+OPEN_PARSE_VERSION = "0.5.8"
 
 
 def version_info() -> str:
diff --git a/src/tests/conftest.py b/src/tests/conftest.py
new file mode 100644
index 0000000..3a2b4a5
--- /dev/null
+++ b/src/tests/conftest.py
@@ -0,0 +1,22 @@
+from pathlib import Path
+
+import pytest
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_sessionstart(session):
+    """Abort the test session unless pytest was started from the project root.
+
+    The tests open fixtures through paths relative to the project root
+    (e.g. ``src/evals/...``), so any other working directory would fail.
+    """
+    # This file lives at <root>/src/tests/conftest.py, so the root is two
+    # directories above the one containing it.
+    expected_dir = Path(__file__).resolve().parents[2]
+    current_dir = Path.cwd().resolve()
+
+    if current_dir != expected_dir:
+        pytest.exit(
+            "Pytest must be run from the project root directory",
+            returncode=1,
+        )
diff --git a/src/tests/test_doc_parser.py b/src/tests/test_doc_parser.py
index ddb126a..efa6636 100644
--- a/src/tests/test_doc_parser.py
+++ b/src/tests/test_doc_parser.py
@@ -1,6 +1,7 @@
-import openparse
 import re
 
+import openparse
+
 
 def test_parse_doc():
     basic_doc_path = "src/evals/data/full-pdfs/mock-1-page-lease.pdf"