Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions fslint-gui
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,10 @@ class fslint(GladeWrapper):
min_size += 'c'
if min_size:
self.findParams += '-size +' + min_size + ' '
if self.opt_duplicate_algorithm_only_md5.get_active():
self.findParams += " --verify-only-md5 "
elif self.opt_duplicate_algorithm_md5_1M.get_active():
self.findParams += " --verify-only-md5-of-first-1M "

self.findParams += self.extra_find_params.get_text()

Expand Down
111 changes: 111 additions & 0 deletions fslint.glade
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,117 @@
<property name="fill">True</property>
</packing>
</child>

<child>
<widget class="GtkVBox" id="vbox11">
<property name="visible">True</property>
<property name="homogeneous">False</property>
<property name="spacing">7</property>

<child>
<widget class="GtkLabel" id="label_opt_duplicate_algorithm">
<property name="visible">True</property>
<property name="label" translatable="yes">Duplicate detection</property>
<property name="use_underline">False</property>
<property name="use_markup">False</property>
<property name="justify">GTK_JUSTIFY_LEFT</property>
<property name="wrap">False</property>
<property name="selectable">False</property>
<property name="xalign">0.5</property>
<property name="yalign">0.5</property>
<property name="xpad">0</property>
<property name="ypad">0</property>
<property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
<property name="width_chars">-1</property>
<property name="single_line_mode">False</property>
<property name="angle">0</property>
</widget>
<packing>
<property name="padding">0</property>
<property name="expand">False</property>
<property name="fill">False</property>
</packing>
</child>

<child>
<widget class="GtkVBox" id="vbox12">
<property name="visible">True</property>
<property name="homogeneous">False</property>
<property name="spacing">0</property>

<child>
<widget class="GtkRadioButton" id="opt_duplicate_algorithm">
<property name="visible">True</property>
<property name="can_focus">True</property>
<property name="label" translatable="yes">Safe: md5&amp;sha1</property>
<property name="use_underline">True</property>
<property name="relief">GTK_RELIEF_NORMAL</property>
<property name="focus_on_click">True</property>
<property name="active">True</property>
<property name="inconsistent">False</property>
<property name="draw_indicator">True</property>
</widget>
<packing>
<property name="padding">0</property>
<property name="expand">False</property>
<property name="fill">False</property>
</packing>
</child>

<child>
<widget class="GtkRadioButton" id="opt_duplicate_algorithm_only_md5">
<property name="visible">True</property>
<property name="can_focus">True</property>
<property name="label" translatable="yes">Unsafe: only md5</property>
<property name="use_underline">True</property>
<property name="relief">GTK_RELIEF_NORMAL</property>
<property name="focus_on_click">True</property>
<property name="active">False</property>
<property name="inconsistent">False</property>
<property name="draw_indicator">True</property>
<property name="group">opt_duplicate_algorithm</property>
</widget>
<packing>
<property name="padding">0</property>
<property name="expand">False</property>
<property name="fill">False</property>
</packing>
</child>

<child>
<widget class="GtkRadioButton" id="opt_duplicate_algorithm_md5_1M">
<property name="visible">True</property>
<property name="tooltip" translatable="yes">Compare files using only md5 hash of the first 1M of file.&#xA;Use with caution - results may contain false positive matches.</property>
<property name="can_focus">True</property>
<property name="label" translatable="yes">Unsafe: md5 first 1M</property>
<property name="use_underline">True</property>
<property name="relief">GTK_RELIEF_NORMAL</property>
<property name="focus_on_click">True</property>
<property name="active">False</property>
<property name="inconsistent">False</property>
<property name="draw_indicator">True</property>
<property name="group">opt_duplicate_algorithm</property>
</widget>
<packing>
<property name="padding">0</property>
<property name="expand">False</property>
<property name="fill">False</property>
</packing>
</child>
</widget>
<packing>
<property name="padding">0</property>
<property name="expand">True</property>
<property name="fill">True</property>
</packing>
</child>
</widget>
<packing>
<property name="padding">0</property>
<property name="expand">False</property>
<property name="fill">True</property>
</packing>
</child>
</widget>
<packing>
<property name="tab_expand">False</property>
Expand Down
51 changes: 35 additions & 16 deletions fslint/findup
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ cleanup_sum() {
'
}

# values: md5_sha1, md5, md5_1M
duplicate_algorithm="md5_sha1"

for arg
do
case "$arg" in
Expand All @@ -129,6 +132,10 @@ do
# Undocumented option to avoid extra
# hardlink merging already done in GUI
gui=1 ;;
--verify-only-md5)
duplicate_algorithm="md5" ;;
--verify-only-md5-of-first-1M)
duplicate_algorithm="md5_1M" ;;
-m)
mode="merge" ;;
-d)
Expand Down Expand Up @@ -190,23 +197,35 @@ else
cat
fi |

# This block selects duplicates using md5sum of whole file
xargs -r0 md5sum -- | #calculate md5sums for possible duplicates
cleanup_sum | #undo any backslash escaping
sort | #group duplicate files together
uniq --all-repeated=$sep_mode -w32 | #pick just duplicates

# The following optional block, checks duplicates again using sha1
# Note for data sets that don't totally fit in cache this will
# probably read duplicate files off the disk again.
cut -s -d' ' -f3- | #get filenames
sort | #sort by paths to try to minimise disk seeks
tr '\n' '\0' | #delimit names with \0
xargs -r0 sha1sum -- | #to be sure to be sure
cleanup_sum | #undo any backslash escaping
sort | #group duplicate files together
uniq --all-repeated=$sep_mode -w40 | #pick just duplicates
# contents comparison according to $duplicate_algorithm
if [ $duplicate_algorithm = "md5_1M" ]; then
xargs -r0 "$script_dir"/supprt/file_size_1m_md5sum |
cleanup_sum | #undo any backslash escaping
sort | #group duplicate files together
uniq --all-repeated=$sep_mode -w47 #pick just duplicates - 15digits size + 32 hex digits of md5
else
# This block selects duplicates using md5sum of whole file
xargs -r0 md5sum -- | #calculate md5sums for possible duplicates
cleanup_sum | #undo any backslash escaping
sort | #group duplicate files together
uniq --all-repeated=$sep_mode -w32 #pick just duplicates
fi |

# extra sha1 pass
if [ $duplicate_algorithm = "md5_sha1" ]; then
# The following optional block, checks duplicates again using sha1
# Note for data sets that don't totally fit in cache this will
# probably read duplicate files off the disk again.
cut -s -d' ' -f3- | #get filenames
sort | #sort by paths to try to minimise disk seeks
tr '\n' '\0' | #delimit names with \0
xargs -r0 sha1sum -- | #to be sure to be sure
cleanup_sum | #undo any backslash escaping
sort | #group duplicate files together
uniq --all-repeated=$sep_mode -w40 #pick just duplicates
else
cat
fi |
cut -d' ' -f3- | #get filenames (and leave separating lines)

if [ "$gui" ]; then
Expand Down
12 changes: 12 additions & 0 deletions fslint/supprt/file_size_1m_md5sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# For every file named on the command line, prints one line made of
# - file size zero-padded to 15 digits
# - 32 hex digits of md5sum of first 1M of file
# - 2 spaces
# - file name
#
# Files that cannot be stat'ed or read are skipped (the failing tool
# reports the reason on stderr) instead of being printed with a
# zero size and the md5 of empty input, which would make all such files
# look identical to each other. The exit status is 1 if any file was
# skipped, 0 otherwise.
#
# NOTE(review): file names are printed verbatim, without md5sum-style
# backslash escaping (unchanged from before); names containing newlines
# presumably still confuse line-oriented consumers - confirm with callers.

# Make a failing `head` fail the whole `head | md5sum` pipeline.
set -o pipefail

# Number of leading bytes of each file that get hashed (1M).
readonly PREFIX_BYTES=1048576

#######################################
# Print the "<size><md5>  <name>" line for a single file.
# Globals:   PREFIX_BYTES (read)
# Arguments: $1 - path of the file to summarise
# Outputs:   the summary line on stdout; tool errors on stderr
# Returns:   0 on success, 1 if the file could not be stat'ed or read
#######################################
print_size_and_md5() {
  local fname=$1
  local size sum

  # `--` keeps names that start with '-' from being parsed as options.
  size=$(stat -c %s -- "$fname") || return 1

  # Feeding head through a redirection avoids option parsing of the name
  # and lets an unreadable file surface as a pipeline failure.
  sum=$(head -c "$PREFIX_BYTES" < "$fname" | md5sum) || return 1

  # md5sum prints "<hash>  -" for stdin; keep only the hash.
  printf '%015d%s  %s\n' "$size" "${sum%% *}" "$fname"
}

#######################################
# Summarise every file given as an argument, continuing past failures.
# Arguments: $@ - paths of the files to summarise
# Returns:   0 if every file was summarised, 1 otherwise
#######################################
main() {
  local status=0
  local fname
  for fname in "$@"; do
    print_size_and_md5 "$fname" || status=1
  done
  return "$status"
}

main "$@"