#!/bin/bash
#
# This little script finds duplicate files, parsing the current directory
# and, recursively, all the others inside it. It can accept more than one
# dir to parse as args, e.g.:
# ./duplicate_finder.sh .
# ./duplicate_finder.sh ~/my_photos ~/las_vegas_marriage /some/where/else/folder
#
# By duplicates I mean files with the same or different names located in
# the same or different dirs. The parsing is first done on sizes and
# then, only on matching files, on their md5 checksum. I hope that
# this can speed up the parsing and definitely eliminate false
# positives.
#
# The script makes wide use of find and inodes; this is to avoid
# problems with strange filenames.
#
# Known bugs: a little problem with md5sum, it prepends a \
# (backslash) to the sum if the filename contains one or more \; this
# problem affects only the output, not the functionality of the script.
# Workarounds for this problem are welcome.
#
# This script is a deep rewrite of one I've found online, written by
# Jarno Elonen (thanks).
# Lately I found fdupes, a C program that does the same (and more) as
# this script.
#
# Do anything you like with this but DO NOT call me for ANY problem
# :-)
#
# pit at asashi a dot net site
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Name of the generated helper script; it is created in the current directory.
readonly OUTPUT_FILE=".duplic"

# Print every file size (in bytes) that is shared by at least two regular
# files found under the given search roots, one size per line.
# Arguments: the search roots (none: find falls back to its own default).
duplicate_sizes() {
   find "$@" -type f -printf '%s\n' | sort | uniq -d
}

# Read candidate sizes on stdin and print "<md5> <inode>" for every regular
# file of one of those sizes found under the given search roots.
# Arguments: the search roots.
checksum_candidates() {
   local size record inode path sum
   while IFS= read -r size; do
      # -type f keeps directories/symlinks of the same size away from md5sum.
      # Records are NUL-terminated so that any file name survives the pipe.
      find "$@" -type f -size "${size}c" -printf '%i\t%p\0'
   done |
   while IFS= read -r -d '' record; do
      inode=${record%%$'\t'*}
      path=${record#*$'\t'}
      # Hash from stdin: md5sum then never sees (or escapes) the file name,
      # and a file that cannot be read is skipped instead of leaving a
      # dangling inode that would be glued to the next record.
      sum=$(md5sum < "$path") || continue
      printf '%s %s\n' "${sum%% *}" "$inode"
   done
}

# Read "<md5> <inode>" lines on stdin and print the inode of every file whose
# checksum occurs more than once. Each inode is printed a single time, even
# when several paths (hard links) share it.
duplicate_inodes() {
   sort | uniq -w 32 -D | awk '!seen[$2]++ { print $2 }'
}

# Read inodes on stdin and print, for each matching file under the given
# search roots, a commented-out "find ... -exec rm -i" command followed by a
# commented-out md5sum line and an empty line.
# Arguments: the search roots.
write_entries() {
   local quoted_roots="" inode
   if (( $# > 0 )); then
      # Shell-quote the roots so that the generated command can be pasted
      # back into a shell even when a root contains spaces or other
      # special characters.
      printf -v quoted_roots '%q ' "$@"
      quoted_roots=${quoted_roots% }
   fi
   while IFS= read -r inode; do
      find "$@" -type f -inum "$inode" \
         -printf '#find ' \
         -exec printf '%s' "$quoted_roots" \; \
         -printf ' -inum %i -exec rm -i "{}" ";" # size: %s\n#' \
         -exec md5sum {} \; \
         -printf '\n'
   done
}

# Refuse to overwrite the result of a previous run.
# NOTE: the message goes to stdout and the exit status stays 0, as before,
# so that existing callers are not affected.
if [[ -e "$OUTPUT_FILE" ]]; then
   echo "Sorry, you should delete an old .duplic file"
   exit 0
fi

printf '%s\n' '#!/bin/bash' > "$OUTPUT_FILE"
duplicate_sizes "$@" \
   | checksum_candidates "$@" \
   | duplicate_inodes \
   | write_entries "$@" >> "$OUTPUT_FILE"
chmod u+x "$OUTPUT_FILE"
ls -l "$OUTPUT_FILE"

exit 0

# Let me explain...
#
# 1- fast check for an existing .duplic file
# 2- echo script header
# 3- find regular files - feed sort & uniq to find those of equal size
# 4- run md5sum on these files
# 5- sort by sum and keep the inodes whose sum occurs more than once
# 6- write a (commented-out) removal entry for each of them in .duplic
#
# vim:tabstop=3:expandtab:shiftwidth=3:autoindent
#