From 6dac7df3401cc30939da83adba20a7a95b7e0bb4 Mon Sep 17 00:00:00 2001 From: ipitio <21136719+ipitio@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:27:20 -0400 Subject: [PATCH] gc.collect --- README.md | 2 +- src/main.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c08b31b..4aba500 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ services: - ./pdf/done:/app/done ``` -1. Run `docker compose up` to OCR the PDFs and move them into `./pdf/done` +3. Run `docker compose up` to OCR the PDFs and move them into `./pdf/done` ## Quick Start diff --git a/src/main.py b/src/main.py index 933f21e..fab4dd1 100644 --- a/src/main.py +++ b/src/main.py @@ -1,3 +1,8 @@ +""" +This script OCRs PDFs +""" + +import gc import os import sys from pathlib import Path @@ -10,7 +15,7 @@ def process_pdfs(base: Path = Path(".")): """ - Process all PDF files in the "todo" directory and save the OCR results in the "done" directory. + Process all PDF files in "todo" and save the results in "done". :param base: The base directory containing the "todo" and "done" directories. """ @@ -24,23 +29,19 @@ def predict(input_file: Path, output_file: Path): # Create a PDF file to store the OCR results doc = fitz.open() - print(f"Created new PDF file: {output_file}") + # Perform OCR on the images for image in convert_from_path(input_file, dpi=300, fmt="jpeg"): - print("Converted PDF page to image") prediction = pytesseract.image_to_pdf_or_hocr(image, extension="pdf") - print("Extracted text from image") doc.insert_pdf(fitz.open("pdf", prediction)) - print("Inserted text into PDF") + gc.collect() # Save the OCR results to a new PDF file doc.save(output_file, garbage=4, deflate=True) doc.close() - print(f"Saved OCR results to {output_file}") input_file.unlink() print(f"Processed {relative_path}") - # parallel: Parallel(n_jobs=-1)( delayed(predict)(Path(root) / file, Path(root.replace("todo", "done")) / file) for root, _, files in os.walk(base / "todo")