• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

ljacqu / FileDuplicateFinder / 5661257559

25 Jul 2023 08:04PM UTC coverage: 23.055% (-0.4%) from 23.453%
5661257559

push

github

ljacqu
Merge remote-tracking branch 'origin/master'

111 of 610 branches covered (18.2%)

Branch coverage included in aggregate %.

378 of 1511 relevant lines covered (25.02%)

1.28 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

80.15
/src/main/java/ch/jalu/fileduplicatefinder/duplicatefinder/FileDuplicateFinder.java
1
package ch.jalu.fileduplicatefinder.duplicatefinder;
2

3
import ch.jalu.fileduplicatefinder.config.FileUtilConfiguration;
4
import ch.jalu.fileduplicatefinder.filefilter.FilePathMatcher;
5
import ch.jalu.fileduplicatefinder.hashing.FileHasher;
6
import ch.jalu.fileduplicatefinder.output.TaskWriterReader;
7
import ch.jalu.fileduplicatefinder.utils.PathUtils;
8
import com.google.common.collect.ArrayListMultimap;
9
import com.google.common.collect.HashMultimap;
10
import com.google.common.collect.ListMultimap;
11
import com.google.common.collect.Multimap;
12
import com.google.common.collect.Multimaps;
13
import com.google.common.io.MoreFiles;
14

15
import java.io.IOException;
16
import java.io.InputStream;
17
import java.io.UncheckedIOException;
18
import java.nio.file.Files;
19
import java.nio.file.Path;
20
import java.util.ArrayList;
21
import java.util.Arrays;
22
import java.util.Collections;
23
import java.util.Comparator;
24
import java.util.HashMap;
25
import java.util.List;
26
import java.util.Map;
27
import java.util.function.Function;
28
import java.util.stream.Collectors;
29
import java.util.stream.Stream;
30

31
import static ch.jalu.fileduplicatefinder.config.FileUtilSettings.DUPLICATE_HASH_MAX_SIZE_MB;
32
import static ch.jalu.fileduplicatefinder.config.FileUtilSettings.DUPLICATE_OUTPUT_DIFFERENCE_READ_FILES_VS_HASH;
33
import static ch.jalu.fileduplicatefinder.config.FileUtilSettings.DUPLICATE_OUTPUT_PROGRESS_FILES_FOUND_INTERVAL;
34
import static ch.jalu.fileduplicatefinder.config.FileUtilSettings.DUPLICATE_OUTPUT_PROGRESS_FILES_HASHED_INTERVAL;
35
import static ch.jalu.fileduplicatefinder.config.FileUtilSettings.DUPLICATE_READ_BEFORE_HASH_BYTES_TO_READ;
36
import static ch.jalu.fileduplicatefinder.config.FileUtilSettings.DUPLICATE_READ_BEFORE_HASH_MIN_SIZE;
37
import static ch.jalu.fileduplicatefinder.utils.FileSizeUtils.megaBytesToBytes;
38

39
public class FileDuplicateFinder {
40

41
    private final Path rootFolder;
42
    private final FileHasher fileHasher;
43
    private final FilePathMatcher pathMatcher;
44
    private final FileUtilConfiguration configuration;
45
    private final TaskWriterReader logger;
46

47
    private final int progressFilesFound;
48
    private final int progressFilesHashed;
49

50
    private final Map<Long, List<Path>> pathsBySize = new HashMap<>();
5✔
51
    private int count;
52
    private int hashingSaveCount;
53

54
    /**
     * Creates a duplicate finder.
     *
     * @param rootFolder the path stored as the starting point of the scan
     * @param fileHasher the hasher stored for computing file hashes
     * @param pathMatcher the matcher stored for deciding which paths are considered
     * @param configuration the configuration; the two progress-interval settings are read from it here
     * @param logger the writer stored for all progress and result output
     */
    public FileDuplicateFinder(Path rootFolder, FileHasher fileHasher, FilePathMatcher pathMatcher,
                               FileUtilConfiguration configuration, TaskWriterReader logger) {
        // Collaborators are kept as passed in; nothing is validated or copied.
        this.rootFolder = rootFolder;
        this.pathMatcher = pathMatcher;
        this.fileHasher = fileHasher;
        this.logger = logger;
        this.configuration = configuration;

        // Progress settings are read once so they are not looked up for every file.
        this.progressFilesFound = configuration.getValue(DUPLICATE_OUTPUT_PROGRESS_FILES_FOUND_INTERVAL);
        this.progressFilesHashed = configuration.getValue(DUPLICATE_OUTPUT_PROGRESS_FILES_HASHED_INTERVAL);
    }
1✔
65

66
    public void processFiles() {
67
        processPath(rootFolder);
4✔
68
        logger.printLn("Found total " + count + " files");
6✔
69
    }
1✔
70

71
    private void processPath(Path path) {
72
        if (pathMatcher.shouldScan(path)) {
5✔
73
            if (Files.isDirectory(path)) {
5✔
74
                PathUtils.list(path).forEach(this::processPath);
6✔
75
            } else if (Files.isRegularFile(path)) {
5!
76
                long fileSize = PathUtils.size(path);
3✔
77
                List<Path> paths = pathsBySize.computeIfAbsent(fileSize, s -> new ArrayList<>(5));
13✔
78
                paths.add(path);
4✔
79

80
                if ((++count & progressFilesFound) == 0) {
11!
81
                    logger.printLn("Found " + count + " files");
×
82
                }
83
            }
84
        }
85
    }
1✔
86

87
    public List<DuplicateEntry> filterFilesForDuplicates() {
88
        logger.printLn("");
4✔
89
        logger.print("Hashing files");
4✔
90
        List<DuplicateEntry> duplicateEntries = pathsBySize.entrySet().stream()
5✔
91
            .filter(entry -> entry.getValue().size() > 1)
12✔
92
            .flatMap(hashEntriesInFileSizeAndReturnDuplicates())
2✔
93
            .sorted(createDuplicateEntryComparator())
2✔
94
            .collect(Collectors.toList());
4✔
95
        if (configuration.getValue(DUPLICATE_OUTPUT_DIFFERENCE_READ_FILES_VS_HASH)) {
7!
96
            logger.printLn("Skipped hashing " + hashingSaveCount + " files from reading bytes before hashing");
×
97
        }
98
        return duplicateEntries;
2✔
99
    }
100

101
    public Map<Integer, Long> getSizeDistribution() {
102
        return pathsBySize.values().stream()
×
103
            .collect(Collectors.groupingBy(List::size, Collectors.counting()));
×
104
    }
105

106
    private Function<Map.Entry<Long, List<Path>>, Stream<DuplicateEntry>> hashEntriesInFileSizeAndReturnDuplicates() {
107
        final int[] hashedFiles = {0};
7✔
108
        return entry -> {
4✔
109
            Runnable progressUpdater = () -> {
4✔
110
                if ((++hashedFiles[0] & progressFilesHashed) == 0) {
12!
111
                    logger.printNewLine();
×
112
                    logger.print("Hashed " + hashedFiles[0] + " files");
×
113
                } else if ((hashedFiles[0] & 15) == 0) {
6!
114
                    logger.printWithoutPrefix(" . ");
×
115
                }
116
            };
1✔
117

118
            return hashFilesAndReturnDuplicates(entry.getKey(), entry.getValue(), progressUpdater);
11✔
119
        };
120
    }
121

122
    private static Comparator<DuplicateEntry> createDuplicateEntryComparator() {
123
        Comparator<DuplicateEntry> comparatorByNumberOfFilesAsc = Comparator.comparing(e -> e.getPaths().size());
3✔
124
        return Comparator.comparing(DuplicateEntry::getSize).reversed()
5✔
125
            .thenComparing(comparatorByNumberOfFilesAsc.reversed());
2✔
126
    }
127

128
    private Stream<DuplicateEntry> hashFilesAndReturnDuplicates(long fileSize, List<Path> paths,
129
                                                                Runnable progressUpdater) {
130
        double maxSizeMegabytes = configuration.getValue(DUPLICATE_HASH_MAX_SIZE_MB);
7✔
131
        if (maxSizeMegabytes > 0 && fileSize >= megaBytesToBytes(maxSizeMegabytes)) {
9!
132
            return Stream.of(new DuplicateEntry(fileSize, "Size " + fileSize, paths));
9✔
133
        }
134

135
        ListMultimap<String, Path> pathsByHash = ArrayListMultimap.create(paths.size(), 2);
5✔
136
        for (Path path : getPathsToHash(paths, fileSize)) {
13✔
137
            try {
138
                pathsByHash.put(fileHasher.calculateHash(path), path);
8✔
139
                progressUpdater.run();
2✔
140
            } catch (IOException e) {
×
141
                throw new UncheckedIOException(path.toAbsolutePath().toString(), e);
×
142
            }
1✔
143
        }
1✔
144

145
        return Multimaps.asMap(pathsByHash).entrySet().stream()
6✔
146
            .filter(e -> e.getValue().size() > 1)
13✔
147
            .map(e -> new DuplicateEntry(fileSize, e.getKey(), e.getValue()));
12✔
148
    }
149

150
    private List<Path> getPathsToHash(List<Path> paths, long filesize) {
151
        if (filesize >= megaBytesToBytes(configuration.getValue(DUPLICATE_READ_BEFORE_HASH_MIN_SIZE))) {
10✔
152
            Multimap<WrappedByteArray, Path> files = HashMultimap.create(paths.size(), 2);
5✔
153
            for (Path path : paths) {
10✔
154
                byte[] bytes = new byte[configuration.getValue(DUPLICATE_READ_BEFORE_HASH_BYTES_TO_READ)];
8✔
155
                try (InputStream is = MoreFiles.asByteSource(path).openBufferedStream()) {
6✔
156
                    is.read(bytes);
4✔
157
                    files.put(new WrappedByteArray(bytes), path);
8✔
158
                } catch (IOException e) {
×
159
                    throw new UncheckedIOException("Could not read '" + path.toAbsolutePath() + "'", e);
×
160
                }
1✔
161
            }
1✔
162
            List<Path> filteredPaths = files.asMap().entrySet().stream()
6✔
163
                .filter(e -> e.getValue().size() > 1 && pathMatcher.hasFileFromResultWhitelist(e.getValue()))
19!
164
                .flatMap(e -> e.getValue().stream())
6✔
165
                .collect(Collectors.toList());
4✔
166
            if (configuration.getValue(DUPLICATE_OUTPUT_DIFFERENCE_READ_FILES_VS_HASH)) {
7!
167
                logger.printLn(paths.size() + " -> " + filteredPaths.size() + " ");
×
168
                hashingSaveCount += paths.size() - filteredPaths.size();
×
169
            }
170
            return filteredPaths;
2✔
171
        } else {
172
            return pathMatcher.hasFileFromResultWhitelist(paths) ? paths : Collections.emptyList();
8!
173
        }
174
    }
175

176
    private static class WrappedByteArray {
177
        private final byte[] value;
178

179
        WrappedByteArray(byte[] value) {
2✔
180
            this.value = value;
3✔
181
        }
1✔
182

183
        @Override
184
        public boolean equals(Object object) {
185
            if (object == this) {
3!
186
                return true;
×
187
            } else if (object instanceof WrappedByteArray) {
3!
188
                return Arrays.equals(value, ((WrappedByteArray) object).value);
7✔
189
            }
190
            return false;
×
191
        }
192

193
        @Override
194
        public int hashCode() {
195
            return Arrays.hashCode(value);
4✔
196
        }
197
    }
198
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc