Skip to content

Commit

Permalink
gc.collect
Browse files Browse the repository at this point in the history
  • Loading branch information
ipitio committed Oct 9, 2024
1 parent 2df4b0b commit 6dac7df
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 8 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ services:
- ./pdf/done:/app/done
```
1. Run `docker compose up` to OCR the PDFs and move them into `./pdf/done`
3. Run `docker compose up` to OCR the PDFs and move them into `./pdf/done`

## Quick Start

Expand Down
15 changes: 8 additions & 7 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
"""
This script OCRs PDFs
"""

import gc
import os
import sys
from pathlib import Path
Expand All @@ -10,7 +15,7 @@

def process_pdfs(base: Path = Path(".")):
"""
Process all PDF files in the "todo" directory and save the OCR results in the "done" directory.
Process all PDF files in "todo" and save the results in "done".
:param base: The base directory containing the "todo" and "done" directories.
"""
Expand All @@ -24,23 +29,19 @@ def predict(input_file: Path, output_file: Path):

# Create a PDF file to store the OCR results
doc = fitz.open()
print(f"Created new PDF file: {output_file}")

# Perform OCR on the images
for image in convert_from_path(input_file, dpi=300, fmt="jpeg"):
print("Converted PDF page to image")
prediction = pytesseract.image_to_pdf_or_hocr(image, extension="pdf")
print("Extracted text from image")
doc.insert_pdf(fitz.open("pdf", prediction))
print("Inserted text into PDF")
gc.collect()

# Save the OCR results to a new PDF file
doc.save(output_file, garbage=4, deflate=True)
doc.close()
print(f"Saved OCR results to {output_file}")
input_file.unlink()
print(f"Processed {relative_path}")

# Run predict in parallel on each file found under "todo", writing to the matching path under "done":
Parallel(n_jobs=-1)(
delayed(predict)(Path(root) / file, Path(root.replace("todo", "done")) / file)
for root, _, files in os.walk(base / "todo")
Expand Down

0 comments on commit 6dac7df

Please sign in to comment.