diff --git a/fslint-gui b/fslint-gui
index c0823d5..47c3719 100755
--- a/fslint-gui
+++ b/fslint-gui
@@ -735,6 +735,10 @@ class fslint(GladeWrapper):
min_size += 'c'
if min_size:
self.findParams += '-size +' + min_size + ' '
+ if self.opt_duplicate_algorithm_only_md5.get_active():
+ self.findParams += " --verify-only-md5 "
+ elif self.opt_duplicate_algorithm_md5_1M.get_active():
+ self.findParams += " --verify-only-md5-of-first-1M "
self.findParams += self.extra_find_params.get_text()
diff --git a/fslint.glade b/fslint.glade
index c2a6a04..d987336 100644
--- a/fslint.glade
+++ b/fslint.glade
@@ -791,6 +791,117 @@
True
+
+
+
+ True
+ False
+ 7
+
+
+
+ True
+ Duplicate detection
+ False
+ False
+ GTK_JUSTIFY_LEFT
+ False
+ False
+ 0.5
+ 0.5
+ 0
+ 0
+ PANGO_ELLIPSIZE_NONE
+ -1
+ False
+ 0
+
+
+ 0
+ False
+ False
+
+
+
+
+
+ True
+ False
+ 0
+
+
+
+ True
+ True
+ Safe: md5&sha1
+ True
+ GTK_RELIEF_NORMAL
+ True
+ True
+ False
+ True
+
+
+ 0
+ False
+ False
+
+
+
+
+
+ True
+ True
+ Unsafe: only md5
+ True
+ GTK_RELIEF_NORMAL
+ True
+ False
+ False
+ True
+ opt_duplicate_algorithm
+
+
+ 0
+ False
+ False
+
+
+
+
+
+ True
+ Compare files using only md5 hash of the first 1M of file.
Use with caution - results may contain false positive matches.
+ True
+ Unsafe: md5 first 1M
+ True
+ GTK_RELIEF_NORMAL
+ True
+ False
+ False
+ True
+ opt_duplicate_algorithm
+
+
+ 0
+ False
+ False
+
+
+
+
+ 0
+ True
+ True
+
+
+
+
+ 0
+ False
+ True
+
+
False
diff --git a/fslint/findup b/fslint/findup
index fe8aee8..8e197a9 100755
--- a/fslint/findup
+++ b/fslint/findup
@@ -116,6 +116,9 @@ cleanup_sum() {
'
}
+# values: md5_sha1, md5, md5_1M
+duplicate_algorithm="md5_sha1"
+
for arg
do
case "$arg" in
@@ -129,6 +132,10 @@ do
# Undocumented option to avoid extra
# hardlink merging already done in GUI
gui=1 ;;
+ --verify-only-md5)
+ duplicate_algorithm="md5" ;;
+ --verify-only-md5-of-first-1M)
+ duplicate_algorithm="md5_1M" ;;
-m)
mode="merge" ;;
-d)
@@ -190,23 +197,35 @@ else
cat
fi |
-# This block selects duplicates using md5sum of whole file
-xargs -r0 md5sum -- | #calculate md5sums for possible duplicates
-cleanup_sum | #undo any backslash escaping
-sort | #group duplicate files together
-uniq --all-repeated=$sep_mode -w32 | #pick just duplicates
-
-# The following optional block, checks duplicates again using sha1
-# Note for data sets that don't totally fit in cache this will
-# probably read duplicate files off the disk again.
-cut -s -d' ' -f3- | #get filenames
-sort | #sort by paths to try to minimise disk seeks
-tr '\n' '\0' | #delimit names with \0
-xargs -r0 sha1sum -- | #to be sure to be sure
-cleanup_sum | #undo any backslash escaping
-sort | #group duplicate files together
-uniq --all-repeated=$sep_mode -w40 | #pick just duplicates
+# contents comparison according to $duplicate_algorithm
+if [ $duplicate_algorithm = "md5_1M" ]; then
+ xargs -r0 "$script_dir"/supprt/file_size_1m_md5sum |
+ cleanup_sum | #undo any backslash escaping
+ sort | #group duplicate files together
+ uniq --all-repeated=$sep_mode -w47 #pick just duplicates - 15digits size + 32 hex digits of md5
+else
+ # This block selects duplicates using md5sum of whole file
+ xargs -r0 md5sum -- | #calculate md5sums for possible duplicates
+ cleanup_sum | #undo any backslash escaping
+ sort | #group duplicate files together
+ uniq --all-repeated=$sep_mode -w32 #pick just duplicates
+fi |
+# extra sha1 pass
+if [ $duplicate_algorithm = "md5_sha1" ]; then
+ # The following optional block, checks duplicates again using sha1
+ # Note for data sets that don't totally fit in cache this will
+ # probably read duplicate files off the disk again.
+ cut -s -d' ' -f3- | #get filenames
+ sort | #sort by paths to try to minimise disk seeks
+ tr '\n' '\0' | #delimit names with \0
+ xargs -r0 sha1sum -- | #to be sure to be sure
+ cleanup_sum | #undo any backslash escaping
+ sort | #group duplicate files together
+ uniq --all-repeated=$sep_mode -w40 #pick just duplicates
+else
+ cat
+fi |
cut -d' ' -f3- | #get filenames (and leave separating lines)
if [ "$gui" ]; then
diff --git a/fslint/supprt/file_size_1m_md5sum b/fslint/supprt/file_size_1m_md5sum
new file mode 100755
index 0000000..f248d68
--- /dev/null
+++ b/fslint/supprt/file_size_1m_md5sum
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Usage: file_size_1m_md5sum FILE...
+#
+# For each FILE, prints one line consisting of
+# - file size zero-padded to 15 digits
+# - 32 hex digits of md5sum of first 1M of file
+# - 2 spaces
+# - file name
+#
+# A file that cannot be read is skipped (the error from head or stat still
+# goes to stderr). Otherwise each unreadable file would get the md5 of empty
+# input, and unreadable files of equal size would be reported as duplicates.
+set -o pipefail  # let a failing head fail the whole hash pipeline
+
+for fname in "$@"; do
+  fileHash=$(head -c1048576 -- "$fname" | md5sum | cut -d' ' -f1) || continue
+  fileSize=$(stat -c %s -- "$fname") || continue
+  printf '%015d%s  %s\n' "$fileSize" "$fileHash" "$fname"
+done