From 6dac7df3401cc30939da83adba20a7a95b7e0bb4 Mon Sep 17 00:00:00 2001
From: ipitio <21136719+ipitio@users.noreply.github.com>
Date: Wed, 9 Oct 2024 16:27:20 -0400
Subject: [PATCH] gc.collect

---
 README.md   |  2 +-
 src/main.py | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index c08b31b..4aba500 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ services:
       - ./pdf/done:/app/done
 ```
 
-1. Run `docker compose up` to OCR the PDFs and move them into `./pdf/done`
+3. Run `docker compose up` to OCR the PDFs and move them into `./pdf/done`
 
 ## Quick Start
 
diff --git a/src/main.py b/src/main.py
index 933f21e..fab4dd1 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,3 +1,8 @@
+"""
+OCR every PDF found under "todo" and save the result under "done".
+"""
+
+import gc
 import os
 import sys
 from pathlib import Path
@@ -10,7 +15,7 @@
 
 def process_pdfs(base: Path = Path(".")):
     """
-    Process all PDF files in the "todo" directory and save the OCR results in the "done" directory.
+    OCR all PDF files in "todo" and save the results in "done".
 
     :param base: The base directory containing the "todo" and "done" directories.
     """
@@ -24,23 +29,19 @@ def predict(input_file: Path, output_file: Path):
 
         # Create a PDF file to store the OCR results
         doc = fitz.open()
-        print(f"Created new PDF file: {output_file}")
+
         # Perform OCR on the images
         for image in convert_from_path(input_file, dpi=300, fmt="jpeg"):
-            print("Converted PDF page to image")
             prediction = pytesseract.image_to_pdf_or_hocr(image, extension="pdf")
-            print("Extracted text from image")
             doc.insert_pdf(fitz.open("pdf", prediction))
-            print("Inserted text into PDF")
+            gc.collect()
 
         # Save the OCR results to a new PDF file
         doc.save(output_file, garbage=4, deflate=True)
         doc.close()
-        print(f"Saved OCR results to {output_file}")
         input_file.unlink()
         print(f"Processed {relative_path}")
 
-    # parallel:
     Parallel(n_jobs=-1)(
         delayed(predict)(Path(root) / file, Path(root.replace("todo", "done")) / file)
         for root, _, files in os.walk(base / "todo")