This is an update to a post from 2011-03-15 - A Java program to recursively find all duplicate files in a directory.

On August 4, Kelvin Goodson emailed me a modified version that does not read the entire file into memory to compute its hash. This prevents out-of-memory errors, but it is slower. I have therefore kept both versions, with the memory-friendly version as the default; you can switch back to the CPU-friendly version by passing a ‘-quick’ command-line flag after the path:

java FindDuplicates path/to/files -quick

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116</td>
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/*
 * an amalgamation of the memory hungry "find duplicate files" program from here ...
 * https://jakut.is/2011/03/15/a-java-program-to-list-all/
 * with the space economic hashing code found here ...
 * http://stackoverflow.com/questions/1741545/java-calculate-sha-256-hash-of-large-file-efficiently
 */
public class FindDuplicates {

    /** Name of the digest algorithm used to fingerprint file contents. */
    private static final String HASH_ALGORITHM = "SHA-512";

    /** Size in bytes of the read buffer used by {@link #makeHashLean(File)}. */
    private static final int LEAN_BUFFER_SIZE = 16384;

    /**
     * Largest file size that {@link #makeHashQuick(File)} loads into a single array.
     * Anything larger cannot be indexed by an int-sized array and is streamed instead.
     */
    private static final long MAX_QUICK_SIZE = Integer.MAX_VALUE - 8;

    static {
        // Fail at class-load time if the algorithm is unavailable.
        newDigest();
    }

    /**
     * Creates a fresh digest instance. A new instance is used for every file so that
     * a read that fails halfway through cannot leave partial state behind that would
     * be mixed into the hash of the next file.
     *
     * @return a new, empty SHA-512 digest
     * @throws RuntimeException if the JVM does not provide the algorithm
     */
    private static MessageDigest newDigest() {
        try {
            return MessageDigest.getInstance(HASH_ALGORITHM);
        } catch (NoSuchAlgorithmException e) {
            throw new RuntimeException("cannot initialize SHA-512 hash function", e);
        }
    }

    /**
     * Renders digest bytes as a base-16 string. Leading zero digits are not padded;
     * this is harmless here because the string is only used as a map key and every
     * digest has the same byte length.
     */
    private static String toHex(byte[] digestBytes) {
        return new BigInteger(1, digestBytes).toString(16);
    }

    /**
     * Recursively walks {@code directory} and records the absolute path of every
     * non-directory entry under the hash of its contents.
     *
     * @param lists         map from content hash to the paths of files with that hash;
     *                      entries are added to it in place
     * @param directory     directory to scan
     * @param leanAlgorithm {@code true} to hash with {@link #makeHashLean(File)},
     *                      {@code false} to hash with {@link #makeHashQuick(File)}
     * @throws IOException      if {@code directory} (or a nested directory) cannot be listed
     * @throws RuntimeException if a file cannot be read
     * @throws Exception        declared for backward compatibility with existing callers
     */
    public static void find(Map<String, List<String>> lists, File directory, boolean leanAlgorithm)
            throws Exception {
        // listFiles() returns null (not an empty array) when the path is not a directory
        // or cannot be read; iterating over it directly would throw NullPointerException.
        File[] children = directory.listFiles();
        if (children == null) {
            throw new IOException("cannot list directory " + directory.getAbsolutePath());
        }

        for (File child : children) {
            if (child.isDirectory()) {
                find(lists, child, leanAlgorithm);
                continue;
            }

            String hash;
            try {
                hash = leanAlgorithm ? makeHashLean(child) : makeHashQuick(child);
            } catch (IOException e) {
                throw new RuntimeException("cannot read file " + child.getAbsolutePath(), e);
            }

            List<String> list = lists.get(hash);
            if (list == null) {
                list = new LinkedList<String>();
                lists.put(hash, list);
            }
            list.add(child.getAbsolutePath());
        }
    }

    /**
     * Quick but memory hungry: loads the whole file into memory and digests it in one
     * call (might like to run with java -Xmx2G or the like to increase heap space if
     * RAM available).
     *
     * <p>Files too large to fit in a single array are delegated to
     * {@link #makeHashLean(File)}, which yields the same hash string.
     *
     * @param infile file to hash
     * @return base-16 SHA-512 digest of the file contents
     * @throws IOException if the file cannot be opened or read
     * @throws Exception   declared for backward compatibility with existing callers
     */
    public static String makeHashQuick(File infile) throws Exception {
        long length = infile.length();
        if (length > MAX_QUICK_SIZE) {
            return makeHashLean(infile);
        }

        byte[] data = new byte[(int) length];
        int filled = 0;
        FileInputStream fin = new FileInputStream(infile);
        try {
            // A single read() may return fewer bytes than requested, so keep reading
            // until the array is full or the stream ends.
            while (filled < data.length) {
                int n = fin.read(data, filled, data.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        } finally {
            fin.close();
        }

        MessageDigest digest = newDigest();
        // Hash only what was actually read, in case the file shrank after length().
        digest.update(data, 0, filled);
        return toHex(digest.digest());
    }

    /**
     * Slower but memory efficient: streams the file through a fixed-size buffer
     * (see {@code LEAN_BUFFER_SIZE}) instead of loading it whole.
     *
     * @param infile file to hash
     * @return base-16 SHA-512 digest of the file contents
     * @throws IOException if the file cannot be opened or read
     * @throws Exception   declared for backward compatibility with existing callers
     */
    public static String makeHashLean(File infile) throws Exception {
        MessageDigest digest = newDigest();
        RandomAccessFile file = new RandomAccessFile(infile, "r");
        try {
            byte[] buffer = new byte[LEAN_BUFFER_SIZE];
            int n;
            // Feed the digest exactly the number of bytes each read() delivered.
            while ((n = file.read(buffer)) != -1) {
                digest.update(buffer, 0, n);
            }
        } finally {
            file.close();
        }
        return toHex(digest.digest());
    }

    /**
     * Command-line entry point.
     *
     * <p>Usage: {@code java FindDuplicates path/to/files [-quick]}. The lean algorithm
     * is used unless the second argument is exactly {@code -quick}. Each group of
     * files sharing a hash is printed after a {@code --} separator line.
     *
     * @param args the directory to scan, optionally followed by {@code -quick}
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Please supply a path to directory to find duplicate files in.");
            return;
        }

        File dir = new File(args[0]);
        if (!dir.isDirectory()) {
            System.out.println("Supplied directory does not exist.");
            return;
        }

        boolean lean = args.length == 1 || !args[1].equals("-quick");
        Map<String, List<String>> lists = new HashMap<String, List<String>>();
        try {
            FindDuplicates.find(lists, dir, lean);
        } catch (Exception e) {
            // Report the failure, then still print whatever was collected before it.
            e.printStackTrace();
        }

        for (List<String> list : lists.values()) {
            if (list.size() > 1) {
                System.out.println("--");
                for (String file : list) {
                    System.out.println(file);
                }
            }
        }
        System.out.println("--");
    }
}