-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathmp3.dupes
executable file
·54 lines (42 loc) · 2.76 KB
/
mp3.dupes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/bin/bash
########################################################################################################################################
# Script to search for duplicate MP3 (and, hypothetically, other media format) files, ignoring any metadata such as ID3 tags ##
# Posted to: ##
# https://superuser.com/questions/248495/how-to-compare-mp3-flac-audio-data-in-a-file-ignoring-header-data-id3-tag-et/1219353#1219353 ##
# https://superuser.com/questions/347141/identify-differences-between-mp3-files/1219360#1219360 ##
########################################################################################################################################
########## Algorithm:
# 1)
#diff Original.mp3 Possible-dup.mp3 ; echo $?
#>Binary files Original.mp3 and Possible-dup.mp3 differ
#>2
# 2)
#$ diff <( ffmpeg -loglevel 8 -i Original.mp3 -map_metadata -1 -f wav - ) <( ffmpeg -loglevel 8 -i Possible-dup.mp3 -map_metadata -1 -f wav - ) ; echo $?
#0
# 3)
#for file in *.mp3; do printf "%s:%s\n" "$( ffmpeg -loglevel 8 -i "$file" -map_metadata -1 -f wav - | sha256sum | cut -d' ' -f1 )" "$file"; done > mp3data.hashes
#find -L orig-dir dir-with-duplicates -name '*.mp3' -print0 | while IFS= read -r -d $'\0' file; do printf "%s:%s\n" "$( ffmpeg -loglevel 8 -i "$file" -map_metadata -1 -f wav - </dev/null | sha256sum | cut -d' ' -f1 )" "$file"; done > mp3data.hashes
# 4) Analyze.
# Only dupes:
#count.by.regexp.awk '([0-9a-f]+):' mp3data.hashes
#>[1:54320b708cea0771a8cf71fac24196a070836376dd83eedd619f247c2ece7480]=1
#>[1:1d8627a21bdbf74cc5c7bc9451f7db264c167f7df4cbad7d8db80bc2f347110f]=2
#>[1:ad48913a11de29ad4639253f2f06d8480b73d48a5f1d0aaa24271c0ba3998d02]=1
# 5) And the list of duplicate files:
# grep mp3data.hashes -f <( count.by.regexp.awk '([0-9a-f]+):' mp3data.hashes | grep -oP '(?<=\[1:).{64}(?!]=1$)' )
##########/ Algorithm
# All in one:
# Usage: mp3.dupes dir-to-scan
#   $1 - directory to scan recursively (symlinks are followed) for *.mp3 files.
# Output (stdout): one "<sha256>:<path>" line for every file whose decoded,
#   metadata-stripped audio stream has the same SHA-256 as at least one other
#   file; lines are sorted, so files sharing a hash are adjacent.
# Diagnostics (stderr): files that could not be decoded are reported and skipped.
# Limitation: the output is line oriented, so paths containing newlines are
#   not handled correctly.
: "${1?Not enough arguments: ${0##*/} dir-to-scan}"
#set -x

# Without the decoding tools every file would hash to the digest of empty
# input and the whole collection would be reported as duplicates of each other.
for required_tool in ffmpeg sha256sum; do
  if ! command -v "${required_tool}" >/dev/null; then
    printf '%s: required tool not found: %s\n' "${0##*/}" "${required_tool}" >&2
    exit 127
  fi
done

# find parses a start point beginning with "-" as part of its expression,
# so anchor such a path to the current directory.
scan_dir=$1
if [[ ${scan_dir} == -* ]]; then
  scan_dir="./${scan_dir}"
fi

HASHES=$(
  # find/while by: https://stackoverflow.com/questions/8677546/reading-null-delimited-strings-through-a-bash-loop/8677566#8677566
  find -L "${scan_dir}" -name '*.mp3' -print0 |
    while IFS= read -r -d '' audio_file; do
      # pipefail (scoped to this substitution) exposes an ffmpeg failure that
      # would otherwise be hidden behind the exit status of cut.
      # </dev/null keeps ffmpeg from consuming the file list on stdin.
      if digest=$(
        set -o pipefail
        ffmpeg -loglevel 8 -i "${audio_file}" -map_metadata -1 -f wav - </dev/null |
          sha256sum | cut -d' ' -f1
      ); then
        printf '%s:%s\n' "${digest}" "${audio_file}"
      else
        # A failed decode yields the digest of empty input; emitting it would
        # make all broken files look like duplicates of each other.
        printf '%s: cannot decode, skipped: %s\n' "${0##*/}" "${audio_file}" >&2
      fi
    done
)

# Keep only the records whose hash (the field before the first ":") occurs
# more than once. The comparison is on the exact hash field, so a hash-like
# string inside a file name cannot cause a false match.
awk -F: '
  {
    digest[NR] = $1
    record[NR] = $0
    occurrences[$1]++
  }
  END {
    for (i = 1; i <= NR; i++)
      if (occurrences[digest[i]] > 1)
        print record[i]
  }
' <<< "${HASHES}" | sort -u
# WARNING: this is more a proof of concept than a real-life solution! It may be improved in several ways, for example:
# 1) Adding pv to show progress
# 2) Using some RDBMS (e.g. sqlite) to make the calculations clearer and easier
# 3) Enhancing the output...