• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pkiraly / metadata-qa-marc / #1527

22 Aug 2025 02:21PM UTC coverage: 90.345%. Remained the same
#1527

push

pkiraly
Improve timeline handling

5191 of 6416 new or added lines in 219 files covered. (80.91%)

886 existing lines in 78 files now uncovered.

36717 of 40641 relevant lines covered (90.34%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

79.33
/src/main/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysis.java
1
package de.gwdg.metadataqa.marc.cli;
2

3
import de.gwdg.metadataqa.marc.Utils;
4
import de.gwdg.metadataqa.marc.analysis.contextual.classification.ClassificationAnalyzer;
5
import de.gwdg.metadataqa.marc.analysis.contextual.classification.ClassificationStatistics;
6
import de.gwdg.metadataqa.marc.analysis.contextual.classification.Marc21ClassificationAnalyzer;
7
import de.gwdg.metadataqa.marc.analysis.contextual.classification.PicaClassificationAnalyzer;
8
import de.gwdg.metadataqa.marc.analysis.contextual.classification.UnimarcClassificationAnalyzer;
9
import de.gwdg.metadataqa.marc.cli.parameters.ClassificationParameters;
10
import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters;
11
import de.gwdg.metadataqa.marc.cli.processor.BibliographicInputProcessor;
12
import de.gwdg.metadataqa.marc.cli.utils.Collocation;
13
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
14
import de.gwdg.metadataqa.marc.cli.utils.Schema;
15
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
16
import de.gwdg.metadataqa.marc.model.validation.ValidationError;
17
import de.gwdg.metadataqa.marc.utils.pica.PicaSubjectManager;
18
import org.apache.commons.cli.Options;
19
import org.apache.commons.cli.ParseException;
20
import org.apache.commons.io.FileUtils;
21
import org.apache.commons.lang3.StringUtils;
22
import org.marc4j.marc.Record;
23

24
import java.io.BufferedWriter;
25
import java.io.File;
26
import java.io.IOException;
27
import java.io.Serializable;
28
import java.nio.file.Files;
29
import java.nio.file.Path;
30
import java.nio.file.Paths;
31
import java.util.Arrays;
32
import java.util.List;
33
import java.util.Map;
34
import java.util.logging.Level;
35
import java.util.logging.Logger;
36

37
import static de.gwdg.metadataqa.marc.Utils.createRow;
38

39
public class ClassificationAnalysis extends QACli<ClassificationParameters> implements BibliographicInputProcessor, Serializable {
40

41
  private static final Logger logger = Logger.getLogger(ClassificationAnalysis.class.getCanonicalName());
1✔
42

43
  private final Options options;
44
  private boolean readyToProcess;
45
  private static char separator = ',';
1✔
46
  private File collectorFile;
47
  ClassificationStatistics statistics = new ClassificationStatistics();
1✔
48

49
  /**
   * Creates the analysis from raw command line arguments.
   *
   * <p>The arguments are parsed into a {@link ClassificationParameters} instance, the
   * resulting {@link Options} are cached, the instance is flagged as ready to process and
   * the {@link Schema} id counter is reset.
   *
   * @param args the command line arguments
   * @throws ParseException if the arguments cannot be parsed
   */
  public ClassificationAnalysis(String[] args) throws ParseException {
    this.parameters = new ClassificationParameters(args);
    this.options = this.parameters.getOptions();
    this.readyToProcess = true;
    Schema.resetIdCounter();
  }
1✔
55

56
  public static void main(String[] args) {
57
    BibliographicInputProcessor processor = null;
1✔
58
    try {
59
      processor = new ClassificationAnalysis(args);
1✔
UNCOV
60
    } catch (ParseException e) {
×
NEW
61
      logger.severe(createRow("ERROR. ", e.getLocalizedMessage()));
×
62
      System.exit(1);
×
63
    }
1✔
64
    if (processor.getParameters().getArgs().length < 1) {
1✔
NEW
65
      logger.severe("Please provide a MARC file name!");
×
UNCOV
66
      processor.printHelp(processor.getParameters().getOptions());
×
67
      System.exit(0);
×
68
    }
69
    if (processor.getParameters().doHelp()) {
1✔
UNCOV
70
      processor.printHelp(processor.getParameters().getOptions());
×
UNCOV
71
      System.exit(0);
×
72
    }
73
    RecordIterator iterator = new RecordIterator(processor);
1✔
74
    iterator.setProcessWithErrors(processor.getParameters().getProcessRecordsWithoutId());
1✔
75
    iterator.start();
1✔
76
  }
1✔
77

78
  @Override
79
  public CommonParameters getParameters() {
80
    return parameters;
1✔
81
  }
82

83
  @Override
84
  public void processRecord(Record marc4jRecord, int recordNumber) throws IOException {
85
    // do nothing
86
  }
1✔
87

88
  @Override
89
  public void processRecord(BibliographicRecord bibliographicRecord, int recordNumber, List<ValidationError> errors) throws IOException {
NEW
90
    processRecord(bibliographicRecord, recordNumber);
×
UNCOV
91
  }
×
92

93
  @Override
94
  public void processRecord(BibliographicRecord bibliographicRecord, int recordNumber) throws IOException {
95
    if (parameters.getRecordIgnorator().isIgnorable(bibliographicRecord))
1✔
UNCOV
96
      return;
×
97

98
    ClassificationAnalyzer analyzer;
99
    if (parameters.isPica()) {
1✔
100
      analyzer = new PicaClassificationAnalyzer(bibliographicRecord, statistics, parameters);
1✔
101
    } else if (parameters.isUnimarc()) {
1✔
102
      analyzer = new UnimarcClassificationAnalyzer(bibliographicRecord, statistics, parameters);
1✔
103
    } else {
104
      analyzer = new Marc21ClassificationAnalyzer(bibliographicRecord, statistics, parameters);
1✔
105
    }
106

107
    analyzer.process();
1✔
108

109
    /*
110
    List<Schema> schemas = analyzer.getSchemasInRecord();
111
    if (!schemas.isEmpty()) {
112
      List<String> abbreviations = schemas
113
        .stream()
114
        .map(Schema::getAbbreviation)
115
        .distinct()
116
        .collect(Collectors.toList());
117
      if (!abbreviations.isEmpty()) {
118
        String joined = StringUtils.join(abbreviations, ":");
119
        printToFile(collectorFile, Utils.createRow(marcRecord.getId(true), joined));
120
      }
121
    }
122
    */
123
  }
1✔
124

125
  @Override
126
  public void beforeIteration() {
127
    // Method not used
128
  }
1✔
129

130
  @Override
131
  public void fileOpened(Path path) {
132
    // Method not used
133
  }
1✔
134

135
  @Override
136
  public void fileProcessed() {
137
    // Method not used
UNCOV
138
  }
×
139

140
  @Override
141
  public void afterIteration(int numberOfprocessedRecords, long duration) {
142
    printClassificationsBySchema();
1✔
143
    printClassificationsByRecords();
1✔
144
    printClassificationsHistogram();
1✔
145
    printFrequencyExamples();
1✔
146
    printSchemaSubfieldsStatistics();
1✔
147
    if (parameters.isDoCollectCollocations())
1✔
148
      printClassificationsCollocation();
1✔
149
    copySchemaFileToOutputDir();
1✔
150
    saveParameters("classifications.params.json", parameters, Map.of("numberOfprocessedRecords", numberOfprocessedRecords, "duration", duration));
1✔
151
  }
1✔
152

153
  private void copySchemaFileToOutputDir() {
154
    if (parameters.isPica()) {
1✔
155
      File source = new File(PicaSubjectManager.getSchemaFile());
1✔
156
      try {
157
        FileUtils.copyFileToDirectory(source, new File(parameters.getOutputDir()));
1✔
UNCOV
158
      } catch (IOException e) {
×
UNCOV
159
        logger.warning(e.getLocalizedMessage());
×
160
      }
1✔
161
    }
162
  }
1✔
163

164
  private void printClassificationsCollocation() {
165
    Path path;
166
    path = Paths.get(parameters.getOutputDir(), "classifications-collocations.csv");
1✔
167
    try (var writer = Files.newBufferedWriter(path)) {
1✔
168
      writer.write(Collocation.header());
1✔
169
      Integer total1 = statistics.getHasClassifications().getOrDefault(true, Integer.valueOf(0));
1✔
170
      Integer total = statistics.recordCountWithClassification();
1✔
171
      logger.info(() -> "total: " + total);
1✔
172
      if (!total1.equals(total))
1✔
UNCOV
173
        logger.log(Level.SEVERE, "total from hasClassifications ({0}) != from collation ({1})", new Object[]{total1, total});
×
174

175
      statistics.getCollocationHistogram()
1✔
176
        .entrySet()
1✔
177
        .stream()
1✔
178
        .map(e -> new Collocation(e.getKey(), e.getValue(), total))
1✔
179
        .sorted((e1, e2) -> e1.compareTo(e2) * -1)
1✔
180
        .forEach(entry -> printCollocation(writer, entry));
1✔
UNCOV
181
    } catch (IOException e) {
×
UNCOV
182
      logger.log(Level.SEVERE, "printClassificationsCollocation", e);
×
183
    }
1✔
184
  }
1✔
185

186
  private void printCollocation(BufferedWriter writer, Collocation entry) {
187
    try {
188
      writer.write(entry.formatRow());
1✔
UNCOV
189
    } catch (IOException e) {
×
UNCOV
190
      logger.log(Level.SEVERE, "printCollocation", e);
×
191
    }
1✔
192
  }
1✔
193

194
  private void printClassificationsBySchema() {
195
    Path path;
196
    path = Paths.get(parameters.getOutputDir(), "classifications-by-schema.csv");
1✔
197
    try (var writer = Files.newBufferedWriter(path)) {
1✔
198
      writer.write(createRow("id", "field", "location", "scheme",
1✔
199
        "abbreviation", "abbreviation4solr", "recordcount", "instancecount",
200
        "type"
201
      ));
202
      statistics.getInstances()
1✔
203
        .entrySet()
1✔
204
        .stream()
1✔
205
        .sorted((e1, e2) -> {
1✔
206
            int i = e1.getKey().getField().compareTo(e2.getKey().getField());
1✔
207
            if (i != 0)
1✔
208
              return i;
1✔
209
            else {
210
              i = e1.getKey().getLocation().compareTo(e2.getKey().getLocation());
1✔
211
              if (i != 0)
1✔
UNCOV
212
                return i;
×
213
              else
214
                return e2.getValue().compareTo(e1.getValue());
1✔
215
            }
216
          }
217
        )
218
        .forEach(
1✔
219
          entry -> printSingleClassificationBySchema(writer, entry)
1✔
220
        );
UNCOV
221
    } catch (IOException e) {
×
UNCOV
222
      logger.log(Level.SEVERE, "printClassificationsBySchema", e);
×
223
    }
1✔
224
  }
1✔
225

226
  private void printSingleClassificationBySchema(BufferedWriter writer,
227
                                                 Map.Entry<Schema, Integer> entry) {
228
    Schema schema = entry.getKey();
1✔
229
    int instanceCount = entry.getValue();
1✔
230
    int recordCount = statistics.getRecords().get(schema);
1✔
231
    try {
232
      writer.write(createRow(
1✔
233
        schema.getId(),
1✔
234
        schema.getField(),
1✔
235
        schema.getLocation(),
1✔
236
        '"' + schema.getSchema().replace("\"", "\"\"") + '"',
1✔
237
        '"' + schema.getAbbreviation().replace("\"", "\"\"") + '"',
1✔
238
        Utils.solarize(schema.getAbbreviation()),
1✔
239
        recordCount,
1✔
240
        instanceCount,
1✔
241
        (schema.getType() == null ? "UNKNOWN" : schema.getType())
1✔
242
      ));
UNCOV
243
    } catch (IOException | NullPointerException ex) {
×
UNCOV
244
      logger.log(Level.SEVERE, "printClassificationsBySchema", ex);
×
UNCOV
245
      logger.severe(schema.toString());
×
246
    }
1✔
247
  }
1✔
248

249
  private void printClassificationsByRecords() {
250
    Path path;
251
    path = Paths.get(parameters.getOutputDir(), "classifications-by-records.csv");
1✔
252
    try (var writer = Files.newBufferedWriter(path)) {
1✔
253
      writer.write(createRow("records-with-classification", "count"));
1✔
254
      statistics.getHasClassifications()
1✔
255
        .entrySet()
1✔
256
        .stream()
1✔
257
        .sorted((e1, e2) ->
1✔
UNCOV
258
          e2.getValue().compareTo(e1.getValue()))
×
259
        .forEach(
1✔
260
          e -> {
261
            try {
262
              writer.write(createRow(e.getKey().toString(), e.getValue()));
1✔
UNCOV
263
            } catch (IOException ex) {
×
UNCOV
264
              logger.log(Level.SEVERE, "printClassificationsByRecords", ex);
×
265
            }
1✔
266
          }
1✔
267
        );
UNCOV
268
    } catch (IOException e) {
×
UNCOV
269
      logger.log(Level.SEVERE, "printClassificationsByRecords", e);
×
270
    }
1✔
271
  }
1✔
272

273
  private void printClassificationsHistogram() {
274
    var path = Paths.get(parameters.getOutputDir(), "classifications-histogram.csv");
1✔
275
    try (var writer = Files.newBufferedWriter(path)) {
1✔
276
      writer.write(createRow("count", "frequency"));
1✔
277
      statistics.getSchemaHistogram()
1✔
278
        .entrySet()
1✔
279
        .stream()
1✔
280
        .sorted((e1, e2) -> e1.getKey().compareTo(e2.getKey()))
1✔
281
        .forEach(
1✔
282
          entry -> {
283
            try {
284
              writer.write(createRow(entry.getKey(), entry.getValue()));
1✔
UNCOV
285
            } catch (IOException e) {
×
UNCOV
286
              logger.log(Level.SEVERE, "printClassificationsHistogram", e);
×
287
            }
1✔
288
          }
1✔
289
        );
UNCOV
290
    } catch (IOException e) {
×
UNCOV
291
      logger.log(Level.SEVERE, "printClassificationsHistogram", e);
×
292
    }
1✔
293
  }
1✔
294

295
  private void printFrequencyExamples() {
296
    var path = Paths.get(parameters.getOutputDir(), "classifications-frequency-examples.csv");
1✔
297
    try (var writer = Files.newBufferedWriter(path)) {
1✔
298
      writer.write(createRow("count", "id"));
1✔
299
      statistics.getFrequencyExamples()
1✔
300
        .entrySet()
1✔
301
        .stream()
1✔
302
        .sorted((e1, e2) -> e1.getKey().compareTo(e2.getKey()))
1✔
303
        .forEach(
1✔
304
          entry -> {
305
            try {
306
              writer.write(createRow(entry.getKey(), entry.getValue()));
1✔
UNCOV
307
            } catch (IOException e) {
×
UNCOV
308
              logger.log(Level.SEVERE, "printFrequencyExamples", e);
×
309
            }
1✔
310
          }
1✔
311
        );
UNCOV
312
    } catch (IOException e) {
×
UNCOV
313
      logger.log(Level.SEVERE, "printFrequencyExamples", e);
×
314
    }
1✔
315
  }
1✔
316

317
  private void printSchemaSubfieldsStatistics() {
318
    Path path;
319
    path = Paths.get(parameters.getOutputDir(), "classifications-by-schema-subfields.csv");
1✔
320
    try (var writer = Files.newBufferedWriter(path)) {
1✔
321
      // final List<String> header = Arrays.asList("field", "location", "label", "abbreviation", "subfields", "scount");
322
      final List<String> header = Arrays.asList("id", "subfields", "count");
1✔
323
      writer.write(createRow(header));
1✔
324
      statistics.getSubfields()
1✔
325
        .entrySet()
1✔
326
        .stream()
1✔
327
        .sorted((e1, e2) ->
1✔
328
          e1.getKey().getField().compareTo(e2.getKey().getField()))
1✔
329
        .forEach(
1✔
330
          schemaEntry -> printSingleSchemaSubfieldsStatistics(writer, schemaEntry)
1✔
331
        );
UNCOV
332
    } catch (IOException e) {
×
UNCOV
333
      logger.log(Level.SEVERE, "printSchemaSubfieldsStatistics", e);
×
334
    }
1✔
335
  }
1✔
336

337
  private void printSingleSchemaSubfieldsStatistics(BufferedWriter writer, Map.Entry<Schema, Map<List<String>, Integer>> schemaEntry) {
338
    Schema schema = schemaEntry.getKey();
1✔
339
    Map<List<String>, Integer> val = schemaEntry.getValue();
1✔
340
    val
1✔
341
      .entrySet()
1✔
342
      .stream()
1✔
343
      .sorted((count1, count2) -> count2.getValue().compareTo(count1.getValue()))
1✔
344
      .forEach(
1✔
345
        countEntry -> {
346
          List<String> subfields = countEntry.getKey();
1✔
347
          int count = countEntry.getValue();
1✔
348
          try {
349
            writer.write(createRow(
1✔
350
              schema.getId(),
1✔
351
              // schema.field,
352
              // schema.location,
353
              // '"' + schema.schema.replace("\"", "\\\"") + '"',
354
              // schema.abbreviation,
355
              StringUtils.join(subfields, ';'),
1✔
356
              count
1✔
357
            ));
UNCOV
358
          } catch (IOException ex) {
×
UNCOV
359
            logger.log(Level.SEVERE, "printSingleSchemaSubfieldsStatistics", ex);
×
360
          }
1✔
361
        }
1✔
362
      );
363
  }
1✔
364

365
  @Override
366
  public void printHelp(Options options) {
367
    // Method not used
UNCOV
368
  }
×
369

370
  @Override
371
  public boolean readyToProcess() {
372
    return readyToProcess;
1✔
373
  }
374

375
  public ClassificationStatistics getStatistics() {
376
    return statistics;
1✔
377
  }
378
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc