diff --git a/fslint-gui b/fslint-gui index c0823d5..47c3719 100755 --- a/fslint-gui +++ b/fslint-gui @@ -735,6 +735,10 @@ class fslint(GladeWrapper): min_size += 'c' if min_size: self.findParams += '-size +' + min_size + ' ' + if self.opt_duplicate_algorithm_only_md5.get_active(): + self.findParams += " --verify-only-md5 " + elif self.opt_duplicate_algorithm_md5_1M.get_active(): + self.findParams += " --verify-only-md5-of-first-1M " self.findParams += self.extra_find_params.get_text() diff --git a/fslint.glade b/fslint.glade index c2a6a04..d987336 100644 --- a/fslint.glade +++ b/fslint.glade @@ -791,6 +791,117 @@ True + + + + True + False + 7 + + + + True + Duplicate detection + False + False + GTK_JUSTIFY_LEFT + False + False + 0.5 + 0.5 + 0 + 0 + PANGO_ELLIPSIZE_NONE + -1 + False + 0 + + + 0 + False + False + + + + + + True + False + 0 + + + + True + True + Safe: md5&sha1 + True + GTK_RELIEF_NORMAL + True + True + False + True + + + 0 + False + False + + + + + + True + True + Unsafe: only md5 + True + GTK_RELIEF_NORMAL + True + False + False + True + opt_duplicate_algorithm + + + 0 + False + False + + + + + + True + Compare files using only md5 hash of the first 1M of file. Use with caution - results may contain false positive matches. 
+ True + Unsafe: md5 first 1M + True + GTK_RELIEF_NORMAL + True + False + False + True + opt_duplicate_algorithm + + + 0 + False + False + + + + + 0 + True + True + + + + + 0 + False + True + + False diff --git a/fslint/findup b/fslint/findup index fe8aee8..8e197a9 100755 --- a/fslint/findup +++ b/fslint/findup @@ -116,6 +116,9 @@ cleanup_sum() { ' } +# values: md5_sha1, md5, md5_1M +duplicate_algorithm="md5_sha1" + for arg do case "$arg" in @@ -129,6 +132,10 @@ do # Undocumented option to avoid extra # hardlink merging already done in GUI gui=1 ;; + --verify-only-md5) + duplicate_algorithm="md5" ;; + --verify-only-md5-of-first-1M) + duplicate_algorithm="md5_1M" ;; -m) mode="merge" ;; -d) @@ -190,23 +197,35 @@ else cat fi | -# This block selects duplicates using md5sum of whole file -xargs -r0 md5sum -- | #calculate md5sums for possible duplicates -cleanup_sum | #undo any backslash escaping -sort | #group duplicate files together -uniq --all-repeated=$sep_mode -w32 | #pick just duplicates - -# The following optional block, checks duplicates again using sha1 -# Note for data sets that don't totally fit in cache this will -# probably read duplicate files off the disk again. 
-cut -s -d' ' -f3- | #get filenames -sort | #sort by paths to try to minimise disk seeks -tr '\n' '\0' | #delimit names with \0 -xargs -r0 sha1sum -- | #to be sure to be sure -cleanup_sum | #undo any backslash escaping -sort | #group duplicate files together -uniq --all-repeated=$sep_mode -w40 | #pick just duplicates +# contents comparison according to $duplicate_algorithm +if [ $duplicate_algorithm = "md5_1M" ]; then + xargs -r0 "$script_dir"/supprt/file_size_1m_md5sum | + cleanup_sum | #undo any backslash escaping + sort | #group duplicate files together + uniq --all-repeated=$sep_mode -w47 #pick just duplicates - 15digits size + 32 hex digits of md5 +else + # This block selects duplicates using md5sum of whole file + xargs -r0 md5sum -- | #calculate md5sums for possible duplicates + cleanup_sum | #undo any backslash escaping + sort | #group duplicate files together + uniq --all-repeated=$sep_mode -w32 #pick just duplicates +fi | +# extra sha1 pass +if [ $duplicate_algorithm = "md5_sha1" ]; then + # The following optional block, checks duplicates again using sha1 + # Note for data sets that don't totally fit in cache this will + # probably read duplicate files off the disk again. 
+ cut -s -d' ' -f3- | #get filenames + sort | #sort by paths to try to minimise disk seeks + tr '\n' '\0' | #delimit names with \0 + xargs -r0 sha1sum -- | #to be sure to be sure + cleanup_sum | #undo any backslash escaping + sort | #group duplicate files together + uniq --all-repeated=$sep_mode -w40 #pick just duplicates +else + cat +fi | cut -d' ' -f3- | #get filenames (and leave separating lines) if [ "$gui" ]; then diff --git a/fslint/supprt/file_size_1m_md5sum b/fslint/supprt/file_size_1m_md5sum new file mode 100755 index 0000000..f248d68 --- /dev/null +++ b/fslint/supprt/file_size_1m_md5sum @@ -0,0 +1,12 @@ +#!/bin/bash +# prints +# - file size zero-padded to 15 digits +# - 32 hex digits of md5sum of first 1M of file +# - 2 spaces +# - file name + +for fname in "$@"; do + fileHash=`head -c1048576 "$fname" |md5sum |cut -d' ' -f1` + fileSize=`stat -c %s "$fname"` + echo "`printf "%015d" "$fileSize"`$fileHash $fname" +done