Skip to content

Commit

Permalink
add spec, clarify code matching in comments
Browse files Browse the repository at this point in the history
  • Loading branch information
peetucket committed Jan 15, 2025
1 parent a2a2d03 commit 819d143
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 3 deletions.
15 changes: 12 additions & 3 deletions lib/dor/text_extraction/cocina_updater.rb
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,12 @@ def find_resource(path)
end
end

# Find a Cocina resource for the given file path of a new file in the workspace by
# looking for a resource that has a matching filename "stem",
# e.g. "file1.xml" would be added to the same resource as "file1.tiff",
# OR a matching filename with the extension appended to the end of the stem,
# e.g. "file1_tiff.xml" would be added to the same resource that contains "file1.tiff"
# when no resource with a matching stem exists.
# Nothing is done if neither the stem nor the stem with extension is found in the Cocina
# see https://github.com/sul-dlss/common-accessioning/issues/1443
# @param path {String} - the path to look for
# @return {FileSet, nil} - the resource or nil if it is not found
Expand Down Expand Up @@ -250,9 +253,15 @@ def stem(path)
end

# Rebuild the original filename (with its real extension) from a generated file whose stem carries
# that extension after its last underscore (e.g. '/user/path/file1_mp4.vtt' => 'file1.mp4').
# NOTE: whatever follows the last underscore is treated as the extension even if it is not a valid extension
# (e.g. 'file1_cool_file.vtt' => 'file1_cool.file'). While this could theoretically cause us to match the file
# to the wrong resource, it is unlikely to happen in practice since we are (1) first looking for exact filename
# matches and (2) only iterating and adding files in the workspace that were generated by OCR or speech to text.
# @param path {String} - the path of the generated file
# @return {String, nil} - the reconstructed filename, or nil when the stem has no underscore
#   or nothing follows its last underscore
def extracted_filename_with_extension(path)
  # Split once on the LAST underscore only, so earlier occurrences of "_<extension>" inside the name
  # are preserved (e.g. 'scan_tiff_page_tiff' => 'scan_tiff_page.tiff', not 'scan_page.tiff').
  basename_without_extension_stem, separator, extension_from_filename = stem(path).rpartition('_')

  # rpartition returns an empty separator when there is no underscore at all; an empty extension
  # means the stem ended with an underscore, so there is nothing to rebuild a filename from.
  return nil if separator.empty? || extension_from_filename.empty?

  "#{basename_without_extension_stem}.#{extension_from_filename}"
end
Expand Down
82 changes: 82 additions & 0 deletions spec/lib/dor/text_extraction/cocina_updater_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# frozen_string_literal: true

require 'spec_helper'

# Specs for the filename-matching helpers of CocinaUpdater: locating the Cocina resource (file set)
# that a newly generated file belongs to, and rebuilding an original filename from a generated one.
# The helpers are invoked through `send` -- presumably because they are private; confirm against the class.
RSpec.describe Dor::TextExtraction::CocinaUpdater do
subject(:updater) { described_class.new(dro:, logger:) }

# Fixture: an image object whose structural metadata contains two file sets,
# each holding a single TIFF file ('file1.tiff' and 'file2.tiff' respectively).
let(:druid) { 'druid:bc123df4567' }
let(:object_type) { Cocina::Models::ObjectType.image }
let(:dro) { instance_double(Cocina::Models::DRO, externalIdentifier: druid, dro?: true, version: 1, type: object_type, structural:) }
let(:logger) { instance_double(Logger) }
# `filename` is supplied by each context below; `path` is the full path handed to the helper under test.
let(:path) { "/some/server/path/#{filename}" }
let(:tiff_file) { instance_double(Cocina::Models::File, filename: 'file1.tiff') }
let(:other_tiff_file) { instance_double(Cocina::Models::File, filename: 'file2.tiff') }
let(:tiff_fileset) { instance_double(Cocina::Models::FileSet, structural: tiff_fileset_structural) }
let(:other_tiff_fileset) { instance_double(Cocina::Models::FileSet, structural: other_tiff_fileset_structural) }
let(:tiff_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [tiff_file]) }
let(:other_tiff_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [other_tiff_file]) }
let(:structural) { instance_double(Cocina::Models::DROStructural, contains: [tiff_fileset, other_tiff_fileset]) }

describe '#find_resource_with_stem' do
# 'file1.xml' shares the stem 'file1' with 'file1.tiff'
context 'when file has matching stem' do
let(:filename) { 'file1.xml' }

it 'finds resource with matching stem' do
expect(updater.send(:find_resource_with_stem, path)).to eq tiff_fileset
end
end

# 'file1_tiff.xml' carries the original extension in its stem and should still resolve to 'file1.tiff'
context 'when file has extension in stem' do
let(:filename) { 'file1_tiff.xml' }

it 'finds resource with matching filename' do
expect(updater.send(:find_resource_with_stem, path)).to eq tiff_fileset
end
end

# neither fixture file set contains a file related to 'file3'
context 'when no matching stem exists' do
let(:filename) { 'file3.xml' }

it 'returns nil' do
expect(updater.send(:find_resource_with_stem, path)).to be_nil
end
end
end

describe '#extracted_filename_with_extension' do
context 'when no underscores exist' do
let(:filename) { 'file1.vtt' }

it 'returns nil' do
expect(updater.send(:extracted_filename_with_extension, path)).to be_nil
end
end

context 'when the file extension is included in the filename' do
let(:filename) { 'file1_tiff.vtt' }

it 'extracts filename with original extension' do
expect(updater.send(:extracted_filename_with_extension, path)).to eq 'file1.tiff'
end
end

# only the part after the last underscore is treated as the extension; earlier underscores stay in the name
context 'when the file extension is included in the filename and there are other underscors in the filename too' do
let(:filename) { 'file1_cool_file_tiff.vtt' }

it 'extracts filename with original extension' do
expect(updater.send(:extracted_filename_with_extension, path)).to eq 'file1_cool_file.tiff'
end
end

context 'when there are underscores in the filename but none are the extension' do
let(:filename) { 'file1_cool_file.vtt' }

# NOTE: this will not raise an exception, and in theory could end up inadvertently matching a file in a resource
# that it should not, but this is a rare edge case ... it'll probably not match anything and be skipped
it 'extracts a filename with something it thinks is an extension, but it actually is not' do
expect(updater.send(:extracted_filename_with_extension, path)).to eq 'file1_cool.file'
end
end
end
end

0 comments on commit 819d143

Please sign in to comment.