Skip to content

Commit

Permalink
rename files when sending to s3 and match correctly when updating cocina
Browse files Browse the repository at this point in the history
  • Loading branch information
peetucket committed Jan 7, 2025
1 parent 9c09f87 commit 3df5ff2
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 12 deletions.
31 changes: 22 additions & 9 deletions lib/dor/text_extraction/cocina_updater.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ module TextExtraction
# 2. If the file will overwrite an existing file in Cocina ensure that
# the file to be overwritten was generated by SDR and has not been
# corrected.
# 3. If the file already exists in the Cocina update it by first deleting
# 3. If the file already exists in the Cocina, update it by first deleting
# from the Cocina and then adding it again.
# 4. If the file does not exist in the Cocina already look find a resource
# that contains the same filename "stem" and add the file there. e.g.
# "abc123.xml" would be added to the same resource that contains
# "abc123.tiff".
#
# 4. If the file does not exist in the Cocina already, find a resource
# that contains a file with the same filename "stem", or a file whose full name
# (stem plus extension) is encoded in the new file's stem as "<stem>_<extension>",
# and add the file there.
# e.g. "abc123.xml" would be added to the same resource that contains "abc123.tiff"
# and "abc123_tiff.xml" would be added to the same resource that contains "abc123.tiff"
# see https://github.com/sul-dlss/common-accessioning/issues/1443 for the logic behind this
# rubocop:disable Metrics/ClassLength
class CocinaUpdater
attr_reader :dro, :logger
Expand Down Expand Up @@ -149,14 +150,17 @@ def find_resource(path)
end
end

# Find a Cocina resource for the given path by looking for a resource that has a matching filename "stem".
# Find a Cocina resource for the given path by looking for a resource that has a matching filename "stem"
# OR a matching filename with an extension added to the end of the stem,
# e.g. "file1_tiff.xml" would be added to the same resource that contains "file1.tiff"
# see https://github.com/sul-dlss/common-accessioning/issues/1443
# @param path {String} - the path to look for
# @return {FileSet, nil} - the resource or nil if it is not found
def find_resource_with_stem(path)
# stem of the incoming path, computed once and compared against each file's stem below
file_stem = stem(path)
# the outer detect yields the first resource for which the inner detect finds a matching file
structural.contains.detect do |resource|
resource.structural.contains.detect do |file|
# NOTE(review): the value of the next line is discarded (it is not the block's last
# expression); it looks like the pre-change line kept by the diff view - confirm.
stem(file.filename) == file_stem
# a file matches when its stem equals the path's stem, OR when its full filename equals
# the name rebuilt from the path's "<stem>_<extension>" form
stem(file.filename) == file_stem || file.filename == extracted_filename_with_extension(path)
end
end
end
Expand Down Expand Up @@ -240,8 +244,17 @@ def resource_type(path)
end
end

# get the stem of a file path removing the extension (e.g. '/user/path/file1.xml' => 'file1')
def stem(path)
# NOTE(review): the value of the next line is discarded (it is not the last expression);
# it looks like the pre-change line kept by the diff view - confirm.
File.basename(path, '.*').to_s
# return value: the basename with the extension reported by File.extname removed
File.basename(path, File.extname(path))
end

# Rebuild the original filename encoded in a renamed output file's stem, where the stem has
# the form "<name>_<extension>" (e.g. '/user/path/file1_mp4.vtt' => 'file1.mp4').
# Only the LAST underscore separates the name from the extension, so names that themselves
# contain underscores are preserved (e.g. 'my_file_mp4.vtt' => 'my_file.mp4').
# @param path {String} - the path of the renamed output file
# @return {String} - the reconstructed filename; when the stem contains no underscore there is
#   no embedded extension and the stem followed by a bare '.' is returned
def extracted_filename_with_extension(path)
name, separator, extension = stem(path).rpartition('_')
# rpartition returns ['', '', whole_string] when no '_' is present, so the stem is in `extension`
return "#{extension}." if separator.empty?

"#{name}.#{extension}"
end

def file_identifier
Expand Down
12 changes: 11 additions & 1 deletion lib/dor/text_extraction/speech_to_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,18 @@ def filenames_to_stt
end

# return the s3 location for a given filename
# NOTE: Due to https://github.com/sul-dlss/common-accessioning/issues/1443, we will rename this file when sending it to whisper
# to ensure whisper will always produce a unique output caption file for each input file provided (even if the basename is identical).
# This is because whisper currently only uses the basename to determine the output filename, which will cause a problem if we have two
# input files with the same basename but different extensions (e.g. "file.mp4" and "file.m4a").
def s3_location(filename)
# NOTE(review): the value of the next line is discarded (it is not the last expression);
# it looks like the pre-change line kept by the diff view - confirm.
File.join(job_id, filename)
file_extension = File.extname(filename)
basename = File.basename(filename, file_extension)
# For example, "file1.mp4" becomes "file1_mp4.mp4" which will cause whisper to produce "file1_mp4.vtt" as the output,
# and "file1.m4a" becomes "file1_m4a.m4a" which will cause whisper to produce "file1_m4a.vtt" as the output, ensuring uniqueness for both.
# The cocina_updater will understand this and update the cocina structural metadata correctly matching the new filename to the original.
# NOTE(review): when File.extname returns an empty string, the new name is just the basename followed by "_" - confirm inputs always carry an extension.
new_filename = "#{basename}_#{file_extension.delete('.')}#{file_extension}"
# the returned key is the renamed file placed under the job_id prefix
File.join(job_id, new_filename)
end

# return the job_id for the stt job, defined as the druid-version of the object
Expand Down
4 changes: 2 additions & 2 deletions spec/lib/dor/text_extraction/speech_to_text_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,8 @@
describe '#s3_location' do
let(:version) { 3 }

it 'returns the s3 filename key for a given filename' do
expect(stt.s3_location('text.xml')).to eq("#{bare_druid}-v#{version}/text.xml")
it 'returns the new s3 filename key for a given filename' do
expect(stt.s3_location('text.xml')).to eq("#{bare_druid}-v#{version}/text_xml.xml")
end
end

Expand Down
77 changes: 77 additions & 0 deletions spec/robots/dor_repo/speech_to_text/update_cocina_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@

it 'runs the update cocina robot and sets the caption role' do
new_cocina = test_perform(robot, druid)
expect(new_cocina.structural.contains.size).to eq 1
new_json_file = new_cocina.structural.contains[0].structural.contains[1]
new_vtt_file = new_cocina.structural.contains[0].structural.contains[2]
expect(new_json_file.filename).to eq 'file1.json'
Expand All @@ -71,6 +72,82 @@
end
end

# Two file sets hold media files that share a basename ("file1") but differ in extension.
# The renamed speech-to-text outputs ("file1_m4a.*" / "file1_mp4.*") must each land in the
# file set that contains the media file they were generated from.
context 'with vtt files and input files with the same filename but different extensions' do
  let(:structural) do
    {
      contains: [
        {
          type: Cocina::Models::FileSetType.file,
          externalIdentifier: "#{bare_druid}_1",
          label: 'Audio 1',
          version: 1,
          structural: {
            contains: [
              {
                type: Cocina::Models::ObjectType.file,
                externalIdentifier: "#{druid}_1",
                label: 'file1.m4a',
                filename: 'file1.m4a',
                size: 123,
                version: 1,
                hasMimeType: 'audio/mp4'
              }
            ]
          }
        },
        {
          type: Cocina::Models::FileSetType.file,
          # distinct identifiers so the two file sets / files are not confused with each other
          externalIdentifier: "#{bare_druid}_2",
          label: 'Video 1',
          version: 1,
          structural: {
            contains: [
              {
                type: Cocina::Models::ObjectType.file,
                externalIdentifier: "#{druid}_2",
                label: 'file1.mp4',
                filename: 'file1.mp4',
                size: 123,
                version: 1,
                hasMimeType: 'video/mp4'
              }
            ]
          }
        }
      ]
    }
  end

  # set up fake caption vtt and json files in the workspace directory whose names match
  # the renamed ("<stem>_<extension>") form of the audio and video files in the Cocina
  before do
    create_speech_to_text_file('file1_m4a.vtt')
    create_speech_to_text_file('file1_m4a.json')
    create_speech_to_text_file('file1_mp4.vtt')
    create_speech_to_text_file('file1_mp4.json')
  end

  # rubocop:disable RSpec/ExampleLength
  it 'runs the update cocina robot and correctly adds the renamed output files to the correct resource' do
    new_cocina = test_perform(robot, druid)
    # check the resource count first so a missing file set fails with a clear expectation
    # message instead of a NoMethodError on nil when indexing below
    expect(new_cocina.structural.contains.size).to eq 2
    new_audio_json_file = new_cocina.structural.contains[0].structural.contains[1]
    new_audio_vtt_file = new_cocina.structural.contains[0].structural.contains[2]
    new_video_json_file = new_cocina.structural.contains[1].structural.contains[1]
    new_video_vtt_file = new_cocina.structural.contains[1].structural.contains[2]
    expect(new_cocina.structural.contains[0].structural.contains[0].filename).to eq 'file1.m4a'
    expect(new_audio_json_file.filename).to eq 'file1_m4a.json'
    expect(new_audio_json_file.use).to be_nil
    expect(new_audio_vtt_file.filename).to eq 'file1_m4a.vtt'
    expect(new_audio_vtt_file.use).to eq 'caption'
    expect(new_cocina.structural.contains[1].structural.contains[0].filename).to eq 'file1.mp4'
    expect(new_video_json_file.filename).to eq 'file1_mp4.json'
    expect(new_video_json_file.use).to be_nil
    expect(new_video_vtt_file.filename).to eq 'file1_mp4.vtt'
    expect(new_video_vtt_file.use).to eq 'caption'
  end
  # rubocop:enable RSpec/ExampleLength
end

context 'with a txt file' do
# setup a fake caption txt and json file in the workspace directory which matches the name of the audio file in the Cocina
before do
Expand Down

0 comments on commit 3df5ff2

Please sign in to comment.