Skip to content

Commit

Permalink
added pdf.to_imgs func
Browse files Browse the repository at this point in the history
  • Loading branch information
Filimoa committed Sep 23, 2024
1 parent 984ed68 commit f5bd782
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ description = "Streamlines the process of preparing documents for LLM's."
readme = "README.md"
requires-python = ">=3.8"
license = { text = "MIT" }
version = "0.5.7"
version = "0.5.8"
authors = [{name = "Sergey Filimonov", email = "[email protected]"}]
dependencies = [
"PyMuPDF >= 1.23.2",
Expand Down
21 changes: 21 additions & 0 deletions src/openparse/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTPage
from PIL import Image
from pydantic import BaseModel
from pypdf import PdfReader, PdfWriter

Expand Down Expand Up @@ -237,3 +238,23 @@ def _flip_coordinates(self, bbox: Bbox) -> Bbox:
x1=bbox.x1,
y1=fy1,
)

def to_imgs(self, page_numbers: Optional[List[int]] = None) -> List[Image.Image]:
    """Render pages of this document as PIL images.

    The document is obtained from ``self.to_pymupdf_doc()`` and each
    requested page is rasterised with the page's ``get_pixmap()``.

    Args:
        page_numbers: Indices used directly to index the document
            (``doc[n]``). When ``None``, every index in
            ``range(doc.page_count)`` is rendered.

    Returns:
        One ``PIL.Image.Image`` per requested page, in the order requested.

    Raises:
        ValueError: If the document reports that it is not a PDF, or that
            it requires a password.
    """
    pdf = self.to_pymupdf_doc()

    # Validate the document before doing any rendering work.
    if not pdf.is_pdf:
        raise ValueError("The document is not in PDF format.")
    if pdf.needs_pass:
        raise ValueError("The PDF document is password protected.")

    def _render(index):
        # NOTE(review): mode "RGB" assumes the pixmap holds 3-channel
        # samples without alpha -- confirm against get_pixmap() defaults.
        pixmap = pdf[index].get_pixmap()
        return Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)

    targets = range(pdf.page_count) if page_numbers is None else page_numbers
    return [_render(index) for index in targets]
2 changes: 1 addition & 1 deletion src/openparse/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
OPEN_PARSE_VERSION = "0.5.7"
OPEN_PARSE_VERSION = "0.5.8"


def version_info() -> str:
Expand Down
16 changes: 16 additions & 0 deletions src/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pathlib import Path

import pytest


@pytest.hookimpl(tryfirst=True)
def pytest_sessionstart(session):
    """Abort the session unless pytest was started from the project root.

    Tests reference fixtures through paths relative to the project root
    (for example ``src/evals/data/...``), so running from any other
    directory would make them fail with confusing file-not-found errors.

    Args:
        session: The pytest session object supplied by the hook; unused.
    """
    # This file lives at <root>/src/tests/conftest.py, so the project root
    # is two levels above the directory containing it. The previous
    # ``parent / "src"`` pointed at <root>/src/tests/src, which never matches
    # the working directory and therefore aborted every run.
    project_root = Path(__file__).resolve().parents[2]

    # Resolve the working directory as well so both sides of the comparison
    # have symlinks expanded.
    current_dir = Path.cwd().resolve()

    if current_dir != project_root:
        pytest.exit(
            "Pytest must be run from the project root directory",
            returncode=1,
        )
3 changes: 2 additions & 1 deletion src/tests/test_doc_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import openparse
import re

import openparse


def test_parse_doc():
basic_doc_path = "src/evals/data/full-pdfs/mock-1-page-lease.pdf"
Expand Down

0 comments on commit f5bd782

Please sign in to comment.