pdfocr.rb

#!/usr/bin/env ruby

# Copyright (c) 2010 Geza Kovacs
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

require 'fileutils'
require 'optparse'
require 'tmpdir'

# Quotes +str+ so that a POSIX shell treats it as one literal word.
#
# The string is wrapped in single quotes; every embedded single quote is
# rewritten as '\'' (close the quote, emit an escaped quote, reopen).
#
# The block form of gsub is required here: in a replacement *string* the
# sequence \' is a back-reference to the text after the match, which would
# silently corrupt any input containing a single quote.
#
# @param str [String] the raw word
# @return [String] the quoted word
def shell_escape(str)
  "'" + str.gsub("'") { "'\\''" } + "'"
end

# Runs a command, echoing its output line by line while it runs, and returns
# everything that was read as a single string.
#
# When extra +args+ are given, the command name and every argument are
# shell-quoted and joined with spaces. When only +cmd+ is given it is passed
# to IO.popen exactly as written, so callers can use redirections and globs.
#
# @param cmd [String] program name, or a complete command line
# @param args [Array<String>] optional arguments, each quoted individually
# @return [String] the concatenated output lines
def sh(cmd, *args)
  command_line =
    if args.empty?
      cmd
    else
      [cmd, *args].map { |word| shell_escape(word) }.join(' ')
    end

  captured = []
  IO.popen(command_line) do |pipe|
    pipe.each_line do |line|
      puts line
      captured << line
    end
  end

  captured.join
end

# Writes +text+ to +filename+, replacing any existing content. The text is
# emitted with puts, so a trailing newline is added when it lacks one.
#
# @param filename [String] path of the file to (over)write
# @param text [String] content to store
def writef(filename, text)
  File.open(filename, 'w') { |out| out.puts(text) }
end

# Program identity.
app_name = 'pdfocr'
version = [0, 1, 4]

# Paths supplied on the command line.
infile = nil
outfile = nil
tmp = nil

# Temp-file handling: delete_dir stays true while the working directory is
# one we create ourselves; delete_files is cleared by --keep.
delete_dir = true
delete_files = true

# OCR settings. check_lang is only switched on when -l is used.
language = 'eng'
check_lang = false
outdpi = '300' # Use string, since this is input to commands anyway

# Engine and preprocessing switches.
use_ocropus = false
use_cuneiform = false
use_tesseract = false
run_unpaper = false

# Command-line definition; each handler simply records the choice in the
# variables above.
optparse = OptionParser.new do |opts|
  opts.banner = <<~USAGE
    Usage: #{app_name} -i input.pdf -o output.pdf
    #{app_name} adds text to PDF files using the ocropus, cuneiform, or tesseract OCR software
  USAGE

  opts.on('-i', '--input [FILE]', 'Specify input PDF file') { |path| infile = path }

  opts.on('-o', '--output [FILE]', 'Specify output PDF file') { |path| outfile = path }

  opts.on('-u', '--unpaper', 'Run unpaper on each page before OCR.') { run_unpaper = true }

  opts.on('-t', '--tesseract', 'Use tesseract as the OCR engine (default)') { use_tesseract = true }

  opts.on('-c', '--cuneiform', 'Use cuneiform as the OCR engine') { use_cuneiform = true }

  opts.on('-p', '--ocropus', 'Use ocropus as the OCR engine') { use_ocropus = true }

  opts.on('-l', '--lang [LANG]', 'Specify language for the OCR software') do |lang|
    check_lang = true
    language = lang
  end

  opts.on('-L', '--nocheck-lang LANG', 'Suppress checking of language parameter') do |lang|
    check_lang = false
    language = lang
  end

  opts.on('-w', '--workingdir [DIR]', 'Specify directory to store temp files in') do |dir|
    tmp = dir
    delete_dir = false
  end

  opts.on('-k', '--keep', 'Keep temporary files around') { delete_files = false }

  opts.on_tail('-h', '--help', 'Show this message') do
    puts opts
    exit
  end

  opts.on_tail('-v', '--version', 'Show version') do
    puts version.join('.')
    exit
  end

  opts.on('--dpi DPI', 'Set OCR and output resolution in DPI. Useful to reduce PDF size') do |dpi|
    outdpi = dpi
  end
end

# Parse the command line, then validate the input path, output path and
# language before any external tool is started.
optparse.parse!(ARGV)

if !infile || infile == ''
  puts optparse
  puts
  puts 'Need to specify an input PDF file'
  exit
end

# end_with? copes with names shorter than three characters, where slicing
# the last three characters would return nil.
unless infile.downcase.end_with?('pdf')
  puts "Input PDF file #{infile} should have a PDF extension"
  exit
end

# baseinfile = infile[0..-5]

# if not baseinfile or baseinfile == ''
#   puts "Input file #{infile} needs to have a name, not just an extension"
#   exit
# end

unless File.file?(infile)
  puts "Input file #{infile} does not exist"
  exit
end

infile = File.expand_path(infile)

if !outfile || outfile == ''
  puts optparse
  puts
  puts 'Need to specify an output PDF file'
  exit
end

unless outfile.downcase.end_with?('pdf')
  puts 'Output PDF file should have a PDF extension'
  exit
end

# infile is already absolute at this point, so the output path has to be
# expanded as well for the comparison to mean anything.
if File.expand_path(outfile) == infile
  puts 'Output PDF file should not be the same as the input PDF file'
  exit
end

if File.file?(outfile)
  puts "Output file #{outfile} already exists"
  exit
end

outfile = File.expand_path(outfile)

if !language || language == ''
  puts 'Need to specify a language'
  exit
end

# Make sure every external program this run relies on is installed.
if `which pdftk` == ''
  puts 'pdftk command is missing. Install the pdftk package'
  exit
end

if `which pdftoppm` == ''
  puts 'pdftoppm command is missing. Install the poppler-utils package'
  exit
end

# Verify the OCR engine. The branch order follows the engine dispatch used
# when pages are processed (cuneiform, then tesseract, then ocropus), so that
# when several engine flags are given the engine being checked is the one
# that actually runs.
if use_cuneiform
  if `which cuneiform` == ''
    puts 'The cuneiform command is missing. Install the cuneiform package.'
    exit
  end
elsif use_tesseract
  if `which tesseract` == ''
    puts 'The tesseract command is missing. Install the tesseract-ocr package and the'
    puts 'language packages you need, e.g. tesseract-ocr-deu, tesseract-ocr-deu-frak,'
    puts 'or tesseract-ocr-eng.'
    exit
  end
elsif use_ocropus
  if `which ocroscript` == ''
    puts 'The ocroscript command is missing. Install the ocropus package.'
    exit
  end
else
  # No engine requested: take the first installed one, preferring tesseract.
  if `which tesseract` != ''
    use_tesseract = true
  elsif `which cuneiform` != ''
    use_cuneiform = true
  elsif `which ocroscript` != ''
    use_ocropus = true
  else
    puts 'The tesseract command is missing. Install the tesseract-ocr package and the'
    puts 'language packages you need, e.g. tesseract-ocr-deu, tesseract-ocr-deu-frak,'
    puts 'or tesseract-ocr-eng.'
    exit
  end
end

# The tesseract path writes its page PDF directly, so hocr2pdf is only
# demanded when one of the other engines is going to run.
if use_cuneiform || !use_tesseract
  if `which hocr2pdf` == ''
    puts 'hocr2pdf command is missing. Install the exactimage package'
    exit
  end
end

if run_unpaper
  if `which unpaper` == ''
    puts 'The unpaper command is missing. Install the unpaper package.'
    exit
  end
end

# Choose the working directory: a fresh temp dir by default, or a new
# "pdfocr" subdirectory inside the directory given with -w.
if delete_dir
  tmp = Dir.mktmpdir
elsif tmp.nil? || tmp.empty?
  # -w takes an optional argument, so it may have been given without a DIR.
  puts 'Need to specify a working directory'
  exit
elsif File.directory?(tmp)
  tmp = "#{File.expand_path(tmp)}/pdfocr"
  if File.directory?(tmp)
    puts "Directory #{tmp} already exists - remove it"
    exit
  else
    Dir.mkdir(tmp)
  end
else
  puts "Working directory #{tmp} does not exist"
  exit
end

# When language checking is enabled, ask the selected engine which languages
# it knows and stop if the requested one is not among them. If the list
# cannot be obtained (or comes back empty) the check is skipped.
if check_lang
  langlist = []

  if use_cuneiform
    begin
      # The languages are taken from the last output line, after the final colon.
      last_line = `cuneiform -l`.split("\n").last
      langlist = last_line.split(':').last.delete('.').split(' ')
    rescue
      puts 'Unable to list supported languages from cuneiform'
    end
  end

  if use_tesseract
    begin
      # Everything after the first output line is treated as a language name.
      report = `tesseract --list-langs 2>&1`
      langlist = report.split("\n")[1..-1]
    rescue
      puts 'Unable to list supported languages from tesseract'
    end
  end

  have_list = langlist && !langlist.empty?
  if have_list && !langlist.include?(language)
    puts "Language #{language} is not supported or not installed. Please choose from"
    puts langlist.join(' ')
    exit
  end
end

# Report the effective settings, then read the document metadata with
# `pdftk dump_data` to learn the page count and keep the info for later.
puts "Input file is #{infile}"
puts "Output file is #{outfile}"
puts "Using working dir #{tmp}"

puts 'Getting info from PDF file'

puts

pdfinfo = sh 'pdftk', infile, 'dump_data'

if !pdfinfo || pdfinfo == ''
  puts "Error: didn't get info from pdftk #{infile} dump_data"
  exit
end

puts

# A missing NumberOfPages field is detected explicitly: converting a failed
# match with to_i would quietly yield 0 instead of raising.
page_count_field = pdfinfo[/NumberOfPages: (\d+)/, 1]
unless page_count_field
  puts "Error: didn't get page count for #{infile} from pdftk"
  exit
end
pagenum = page_count_field.to_i

if pagenum.zero?
  puts "Error: there are 0 pages in the input PDF file #{infile}"
  exit
end

# Saved so the metadata can be re-applied to the merged output.
writef("#{tmp}/pdfinfo.txt", pdfinfo)

puts "Converting #{pagenum} pages"

# Width used to zero-pad page file names so they sort in page order.
numdigits = pagenum.to_s.length

# Process the document one page at a time inside the working directory:
# extract the page, rasterise it, optionally clean it with unpaper, run the
# OCR engine and end up with "<page>-new.pdf". A page whose step fails is
# reported and skipped.
Dir.chdir("#{tmp}/") do
  1.upto(pagenum) do |i|
    puts '=========='
    puts "Extracting page #{i}"
    basefn = i.to_s.rjust(numdigits, '0')
    sh 'pdftk', infile, 'cat', i.to_s, 'output', "#{basefn}.pdf"
    unless File.file?("#{basefn}.pdf")
      puts "Error while extracting page #{i}"
      next
    end
    puts "Converting page #{i} to ppm"

    # Built as a shell string because of the redirection; the DPI value comes
    # from the command line, so it is quoted like the file names.
    sh "pdftoppm -cropbox -r #{shell_escape(outdpi)} #{shell_escape(basefn)}.pdf >#{shell_escape(basefn)}.ppm"
    unless File.file?("#{basefn}.ppm")
      puts "Error while converting page #{i} to ppm"
      next
    end

    if run_unpaper
      puts "Running unpaper on page #{i}"
      sh 'unpaper', "#{basefn}.ppm", "#{basefn}_unpaper.ppm"
      unless File.file?("#{basefn}_unpaper.ppm")
        puts "Error while running unpaper on page #{i}"
        next
      end
      sh 'mv', "#{basefn}_unpaper.ppm", "#{basefn}.ppm"
    end

    puts "Running OCR on page #{i}"
    if use_cuneiform
      sh 'cuneiform', '-l', language, '-f', 'hocr', '-o', "#{basefn}.hocr", "#{basefn}.ppm"
    elsif use_tesseract
      # tesseract writes the searchable PDF itself; if it fails, the page is
      # passed through unchanged so the output keeps all its pages.
      sh 'tesseract', '--dpi', outdpi, '-l', language, "#{basefn}.ppm", "#{basefn}-new", 'pdf'
      unless File.file?("#{basefn}-new.pdf")
        puts "Error while running OCR on page #{i}"
        puts "Input page will be added to output without OCR."
        sh 'mv', "#{basefn}.pdf", "#{basefn}-new.pdf"
      end
      next
    else
      sh "ocroscript recognize #{shell_escape(basefn)}.ppm > #{shell_escape(basefn)}.hocr"
    end

    # cuneiform and ocropus only produce hOCR; the text layer still has to be
    # combined with the page image to obtain the page PDF.
    unless File.file?("#{basefn}.hocr")
      puts "Error while running OCR on page #{i}"
      next
    end

    puts "Embedding text into PDF for page #{i}"
    sh "hocr2pdf -r #{shell_escape(outdpi)} -i #{shell_escape(basefn)}.ppm -s " \
       "-o #{shell_escape(basefn)}-new.pdf <#{shell_escape(basefn)}.hocr"
    unless File.file?("#{basefn}-new.pdf")
      puts "Error while embedding text into PDF for page #{i}"
      next
    end
  end
end

# Join the per-page PDFs, re-apply the saved document info, and clean up.
puts 'Merging together PDF files'

# The page list is built in Ruby and passed as separate, individually quoted
# arguments, so characters in the working-directory path are never
# interpreted by the shell. Page file names are zero-padded, which makes a
# plain sort equal to page order.
page_pdfs = Dir.entries(tmp)
               .select { |name| name.end_with?('-new.pdf') }
               .sort
               .map { |name| File.join(tmp, name) }

if page_pdfs.empty?
  puts 'Error: no pages were converted, so there is nothing to merge'
else
  sh 'pdftk', *page_pdfs, 'cat', 'output', "#{tmp}/merged.pdf"

  puts "Updating PDF info for #{outfile}"

  sh 'pdftk', "#{tmp}/merged.pdf", 'update_info', "#{tmp}/pdfinfo.txt", 'output', outfile
end

if delete_files
  puts 'Cleaning up temporary files'
  FileUtils.rmtree tmp
end