• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pkiraly / metadata-qa-marc / #1527

22 Aug 2025 02:21PM UTC coverage: 90.345%. Remained the same
#1527

push

pkiraly
Improve timeline handling

5191 of 6416 new or added lines in 219 files covered. (80.91%)

886 existing lines in 78 files now uncovered.

36717 of 40641 relevant lines covered (90.34%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

84.78
/src/main/java/de/gwdg/metadataqa/marc/utils/unimarc/UnimarcSchemaReader.java
1
package de.gwdg.metadataqa.marc.utils.unimarc;
2

3
import de.gwdg.metadataqa.marc.EncodedValue;
4
import de.gwdg.metadataqa.marc.definition.structure.ControlfieldPositionDefinition;
5
import de.gwdg.metadataqa.marc.definition.structure.Indicator;
6
import de.gwdg.metadataqa.marc.definition.structure.SubfieldDefinition;
7
import net.minidev.json.JSONObject;
8
import net.minidev.json.parser.JSONParser;
9
import net.minidev.json.parser.ParseException;
10

11
import java.io.FileNotFoundException;
12
import java.io.FileReader;
13
import java.io.InputStream;
14
import java.io.InputStreamReader;
15
import java.nio.charset.StandardCharsets;
16
import java.util.ArrayList;
17
import java.util.HashMap;
18
import java.util.List;
19
import java.util.Map;
20
import java.util.logging.Logger;
21
import java.util.regex.Pattern;
22

23
/**
24
 * Reads a UNIMARC schema from a JSON file, so that it can be used to process UNIMARC records.
25
 */
26
public class UnimarcSchemaReader {
1✔
27
  private static final Logger logger = Logger.getLogger(UnimarcSchemaReader.class.getCanonicalName());
1✔
28
  private static final String LABEL = "label";
29
  private static final String TAG = "tag";
30
  private static final String REQUIRED = "required";
31
  private static final String REPEATABLE = "repeatable";
32
  private static final String INDICATOR_1 = "indicator1";
33
  private static final String INDICATOR_2 = "indicator2";
34
  private static final String SUBFIELDS = "subfields";
35
  private static final String POSITIONS = "positions";
36
  private static final String CODES = "codes";
37
  private static final String FLAGS = "flags";
38
  private static final String CODELIST = "codelist";
39
  private static final String PATTERN = "pattern";
40
  private static final String START = "start";
41
  private static final String END = "end";
42

43
  private static final Map<String, Integer> knownFieldProperties = Map.of(
1✔
44
      LABEL, 1,
1✔
45
      TAG, 1,
1✔
46
      REQUIRED, 1,
1✔
47
      REPEATABLE, 1,
1✔
48
      INDICATOR_1, 1,
1✔
49
      INDICATOR_2, 1,
1✔
50
      SUBFIELDS, 1,
1✔
51
      POSITIONS, 1);
1✔
52
  private static final Map<String, Integer> knownSubfieldProperties = Map.of(
1✔
53
      LABEL, 1,
1✔
54
      REPEATABLE, 1,
1✔
55
      CODELIST, 1,
1✔
56
      POSITIONS, 1,
1✔
57
      CODES, 1);
1✔
58
  private static final Map<String, Integer> knownIndicatorProperties = Map.of(
1✔
59
      LABEL, 1,
1✔
60
      CODES,1);
1✔
61

62
  private final JSONParser parser = new JSONParser(JSONParser.MODE_RFC4627);
1✔
63
  private final UnimarcSchemaManager schema = new UnimarcSchemaManager();
1✔
64
  private final Map<String, List<EncodedValue>> codeLists = new HashMap<>();
1✔
65

66
  public UnimarcSchemaManager createSchema(InputStream inputStream) {
67
    try {
NEW
68
      JSONObject jsonObject = readStream(inputStream);
×
NEW
69
      processCodeLists(jsonObject);
×
NEW
70
      processFields(jsonObject);
×
NEW
71
    } catch (ParseException e) {
×
NEW
72
      logger.severe(e.getLocalizedMessage());
×
NEW
73
    }
×
74

NEW
75
    return schema;
×
76
  }
77

78
  public UnimarcSchemaManager createSchema(String filename) {
79
    try {
80
      JSONObject jsonObject = readFile(filename);
1✔
81
      processCodeLists(jsonObject);
1✔
82
      processFields(jsonObject);
1✔
NEW
83
    } catch (FileNotFoundException | ParseException e) {
×
NEW
84
      logger.severe(e.getLocalizedMessage());
×
85
    }
1✔
86

87
    return schema;
1✔
88
  }
89

90
  /**
91
   * Used to load all code lists from the JSON object and store them in the codeLists map. The code lists are going to
92
   * be used to resolve the codes in subfields or positions.
93
   * @param jsonObject The JSON object of the schema which is assumed to contain the codelists object.
94
   */
95
  private void processCodeLists(JSONObject jsonObject) {
96
    JSONObject fields = (JSONObject) jsonObject.get("codelists");
1✔
97
    for (Map.Entry<String, Object> entry : fields.entrySet()) {
1✔
98
      String codeListName = entry.getKey();
1✔
99
      JSONObject properties = (JSONObject) entry.getValue();
1✔
100
      List<EncodedValue> codeList = processCodes((JSONObject) properties.get(CODES));
1✔
101
      codeLists.put(codeListName, codeList);
1✔
102
    }
1✔
103
  }
1✔
104

105
  private JSONObject readFile(String filename) throws FileNotFoundException, ParseException {
106
    FileReader reader = new FileReader(filename);
1✔
107
    return (JSONObject) parser.parse(reader);
1✔
108
  }
109

110
  private JSONObject readStream(InputStream stream) throws ParseException {
NEW
111
    InputStreamReader streamReader = new InputStreamReader(stream, StandardCharsets.UTF_8);
×
NEW
112
    return (JSONObject) parser.parse(streamReader);
×
113
  }
114

115
  private void processFields(JSONObject obj) {
116
    JSONObject fields = (JSONObject) obj.get("fields");
1✔
117
    for (Map.Entry<String, Object> entry : fields.entrySet()) {
1✔
118
      String tag = entry.getKey();
1✔
119
      JSONObject jsonField = (JSONObject) fields.get(tag);
1✔
120

121
      // If the tag is 'LEADER', then create a special field definition for the leader which is essentially a
122
      // ControlfieldDefinition. If the tag is not 'LEADER', then create a normal field definition.
123

124
      if (tag.equals("LEADER")) {
1✔
125
        UnimarcLeaderDefinition leaderDefinition = createLeaderDefinition(jsonField);
1✔
126
        schema.setLeaderDefinition(leaderDefinition);
1✔
127
        continue;
1✔
128
      }
129

130
      UnimarcFieldDefinition fieldDefinition = createFieldDefinition(tag, jsonField);
1✔
131

132
      if (schema.lookup(tag) != null) {
1✔
NEW
133
        logger.warning(() -> "duplicate field definition for tag: " + tag);
×
134
      }
135
      // Take the last definition of a field, as it is the most recent one
136
      schema.add(fieldDefinition);
1✔
137
    }
1✔
138
  }
1✔
139

140
  private UnimarcLeaderDefinition createLeaderDefinition(JSONObject jsonField) {
141
    UnimarcLeaderDefinition leaderDefinition = new UnimarcLeaderDefinition();
1✔
142
    // Set position definitions
143
    List<ControlfieldPositionDefinition> positions = getPositions(jsonField, "leader");
1✔
144
    leaderDefinition.setControlfieldPositions(positions);
1✔
145
    return leaderDefinition;
1✔
146
  }
147

148
  private UnimarcFieldDefinition createFieldDefinition(String tag, JSONObject jsonField) {
149
    // In this situation, it isn't necessary to access the JSON value of 'tag' directly,
150
    // as it is already available as the key of the UNIMARC field.
151
    UnimarcFieldDefinition fieldDefinition = new UnimarcFieldDefinition(
1✔
152
        tag,
153
        (String) jsonField.get(LABEL),
1✔
154
        jsonField.get(REPEATABLE) != null && (boolean) jsonField.get(REPEATABLE),
1✔
155
        jsonField.get(REQUIRED) != null && (boolean) jsonField.get(REQUIRED)
1✔
156
    );
157

158
    Indicator indicator1 = getIndicator(1, jsonField);
1✔
159
    fieldDefinition.setInd1(indicator1);
1✔
160
    Indicator indicator2 = getIndicator(2, jsonField);
1✔
161
    fieldDefinition.setInd2(indicator2);
1✔
162
    List<SubfieldDefinition> subfieldDefinitions = getSubfields(jsonField, tag);
1✔
163
    fieldDefinition.setSubfieldDefinitions(subfieldDefinitions);
1✔
164

165
    // Log all unhandled field properties
166
    for (String property : jsonField.keySet()) {
1✔
167
      if (!knownFieldProperties.containsKey(property)) {
1✔
NEW
168
        logger.warning(() -> "unhandled property for field " + tag + ": " + property);
×
169
      }
170
    }
1✔
171

172
    return fieldDefinition;
1✔
173
  }
174

175
  /**
176
   * Retrieves the indicator from the JSON object. An in
177
   * @param indicatorNumber Either 1 or 2
178
   * @param jsonField The JSON object of the field
179
   */
180
  private Indicator getIndicator(int indicatorNumber, JSONObject jsonField) {
181
    String indicatorKey = "indicator" + indicatorNumber;
1✔
182
    JSONObject jsonIndicator = (JSONObject) jsonField.get(indicatorKey);
1✔
183
    if (jsonIndicator == null) {
1✔
184
      // Return an empty indicator which represent the empty values in order to conform with MARC21 fields
185
      return new Indicator();
1✔
186
    }
187

188
    Indicator indicator = new Indicator((String) jsonIndicator.get(LABEL));
1✔
189
    List<EncodedValue> codes = getValueExpressions(jsonIndicator);
1✔
190
    indicator.setCodes(codes);
1✔
191

192
    // Log all unhandled indicator properties
193
    for (String property : jsonIndicator.keySet()) {
1✔
194
      if (!knownIndicatorProperties.containsKey(property)) {
1✔
NEW
195
        logger.warning(() -> "unhandled indicator property: " + property);
×
196
      }
197
    }
1✔
198

199
    return indicator;
1✔
200
  }
201

202
  private List<SubfieldDefinition> getSubfields(JSONObject jsonField, String parentTag) {
203
    // Subfields are a JSON object in our schema
204
    JSONObject subfields = (JSONObject) jsonField.get(SUBFIELDS);
1✔
205
    if (subfields == null) {
1✔
206
      return List.of();
1✔
207
    }
208

209
    List<SubfieldDefinition> subfieldDefinitions = new ArrayList<>();
1✔
210

211
    for (Map.Entry<String, Object> entry : subfields.entrySet()) {
1✔
212
      String code = entry.getKey();
1✔
213
      // Avoid personally defined JSON comments
214
      if (code.startsWith("//")) {
1✔
NEW
215
        continue;
×
216
      }
217

218
      JSONObject jsonSubfield = (JSONObject) subfields.get(code);
1✔
219
      // In this situation, it isn't necessary to access the JSON value of 'code' directly,
220
      // as it is already available as the key of the UNIMARC subfield.
221
      Object repeatable = jsonSubfield.get(REPEATABLE);
1✔
222

223
      SubfieldDefinition subfieldDefinition = new SubfieldDefinition(
1✔
224
          code,
225
          (String) jsonSubfield.get(LABEL),
1✔
226
          repeatable != null && (boolean) repeatable
1✔
227
      );
228

229
      List<EncodedValue> valueExpressions = getValueExpressions(jsonSubfield);
1✔
230
      subfieldDefinition.setCodes(valueExpressions);
1✔
231

232
      String subfieldTag = String.format("%s$%s", parentTag, code);
1✔
233

234
      List<ControlfieldPositionDefinition> positions = getPositions(jsonSubfield, subfieldTag);
1✔
235
      subfieldDefinition.setPositions(positions);
1✔
236

237
      // Check if the subfield is a duplicate
238
      if (subfieldDefinitions.stream().anyMatch(subfield -> subfield.getCode().equals(code))) {
1✔
NEW
239
        logger.warning(() -> "duplicate subfield definition for tag: " + subfieldTag);
×
240
      }
241
      subfieldDefinitions.add(subfieldDefinition);
1✔
242

243
      // Log all unhandled subfield properties
244
      for (String property : jsonSubfield.keySet()) {
1✔
245
        if (!knownSubfieldProperties.containsKey(property)) {
1✔
NEW
246
          logger.warning(() -> String.format("%s$%s unhandled subfield property: %s", parentTag, code, property));
×
247
        }
248
      }
1✔
249
    }
1✔
250
    return subfieldDefinitions;
1✔
251
  }
252

253
  private List<ControlfieldPositionDefinition> getPositions(JSONObject positionParent, String parentTag) {
254
    JSONObject positions = (JSONObject) positionParent.get(POSITIONS);
1✔
255
    if (positions == null) {
1✔
256
      return List.of();
1✔
257
    }
258
    List<ControlfieldPositionDefinition> positionDefinitions = new ArrayList<>();
1✔
259
    for (Map.Entry<String, Object> positionEntry : positions.entrySet()) {
1✔
260

261
      if (positionEntry.getKey().startsWith("//")) {
1✔
NEW
262
        continue;
×
263
      }
264

265
      JSONObject position = (JSONObject) positionEntry.getValue();
1✔
266

267
      int positionStart = (int) position.get(START);
1✔
268
      Object positionEndObject = position.get(END);
1✔
269

270
      // As the implementation of ControlfieldPositionDefinition requires a positionEnd, and it seems
271
      // to be slightly different to what is specified in the UNIMARC manuals, we add 1 to the positionEnd
272
      int positionEnd = (positionEndObject == null ? positionStart : (int) positionEndObject) + 1;
1✔
273

274
      ControlfieldPositionDefinition positionDefinition = new ControlfieldPositionDefinition(
1✔
275
          (String) position.get(LABEL),
1✔
276
          positionStart,
277
          positionEnd
278
      );
279
      String positionKey = positionEntry.getKey();
1✔
280
      // Check position places. This doesn't produce any side effects except for log warnings.
281
      checkPositionPlaces(positionKey, positionStart, positionEndObject);
1✔
282
      String positionId = String.format("%s/%s", parentTag, positionEntry.getKey());
1✔
283
      positionDefinition.setId(positionId);
1✔
284

285
      assignPositionValueExpressions(position, positionDefinition);
1✔
286

287
      positionDefinitions.add(positionDefinition);
1✔
288
    }
1✔
289
    return positionDefinitions;
1✔
290
  }
291

292
  private void assignPositionValueExpressions(JSONObject position, ControlfieldPositionDefinition positionDefinition) {
293
    List<EncodedValue> codes = getCodes(position, CODES);
1✔
294
    if (!codes.isEmpty()) {
1✔
295
      positionDefinition.setCodes(codes);
1✔
296

297
      positionDefinition.setRepeatableContent(false);
1✔
298
      // In case there are codes, don't check for any other value-defining properties such as flags or patterns
299
      return;
1✔
300
    }
301

302
    // In case there are no codes, check for flags and a pattern
303
    // Flags make the position repeatable
304
    // Patterns are used to validate the content of the position
305
    // Both flags and the pattern are represented as EncodedValues
306
    codes = getCodes(position, FLAGS);
1✔
307
    if (!codes.isEmpty()) {
1✔
308
      positionDefinition.setCodes(codes);
1✔
309
      positionDefinition.setRepeatableContent(true);
1✔
310

311
      int unitLength = codes.get(0).getCode().length();
1✔
312
      positionDefinition.setUnitLength(unitLength);
1✔
313
    }
314

315
    EncodedValue pattern = getPattern(position);
1✔
316

317
    // The pattern can also contain groups, which are used to extract the value from the position
318
    // For example: ^(0[1-9]|[1-9][0-9])$|^(xx)$
319
    // Semantics of the first and the second group are defined in the schema in a "groups" object
320
    if (pattern != null) {
1✔
321
      assignGroupsToPattern(position, pattern);
1✔
322
      codes.add(pattern);
1✔
323
    }
324

325
    positionDefinition.setCodes(codes);
1✔
326
  }
1✔
327

328
  private List<EncodedValue> getValueExpressions(JSONObject subfield) {
329
    List<EncodedValue> codes = getCodes(subfield, CODES);
1✔
330
    if (!codes.isEmpty()) {
1✔
331
      // In case there are codes, don't check for any other value-defining properties such as flags or patterns
332
      return codes;
1✔
333
    }
334

335
    // In case there are no codes, check for a pattern
336
    // Patterns are used to validate the content of the position
337
    EncodedValue pattern = getPattern(subfield);
1✔
338

339
    // The pattern can also contain groups, which are used to extract the value from the position
340
    // For example: ^(0[1-9]|[1-9][0-9])$|^(xx)$
341
    // Semantics of the first and the second group are defined in the schema in a "groups" object
342
    if (pattern != null) {
1✔
NEW
343
      assignGroupsToPattern(subfield, pattern);
×
NEW
344
      codes.add(pattern);
×
345
    }
346

347
    return codes;
1✔
348
  }
349

350
  private void checkPositionPlaces(String key, int positionStart, Object positionEndObject) {
351
    // Compare the key with the postitionStart and positionEnd. The key should be in the format positionStart-positionEnd,
352
    // or only positionStart if positionEnd is null
353
    String[] keyParts = key.split("-");
1✔
354

355
    if (keyParts.length == 1) {
1✔
356
      // If the key is only one part, then it should be equal to the positionStart int
357
      if (Integer.parseInt(keyParts[0]) != positionStart) {
1✔
NEW
358
        logger.warning(() -> String.format("positionStart (%s) and key (%s) don't match", positionStart, key));
×
359
      }
360
      return;
1✔
361
    }
362

363
    if (keyParts.length != 2) {
1✔
NEW
364
      logger.warning(() -> String.format("key (%s) is not in the format of positionStart-positionEnd", key));
×
NEW
365
      return;
×
366
    }
367

368
    // If the key is two parts, then the first part should be equal to the positionStart and the second part
369
    // should be equal to the positionEnd
370
    if (Integer.parseInt(keyParts[0]) != positionStart) {
1✔
NEW
371
      logger.warning(() -> String.format("positionStart (%s) and key (%s) don't match", positionStart, key));
×
372
    }
373

374
    if (positionEndObject == null) {
1✔
NEW
375
      logger.warning(() -> String.format("positionEnd (%s) and key (%s) don't match", positionEndObject, key));
×
NEW
376
      return;
×
377
    }
378

379
    int positionEnd = (int) positionEndObject;
1✔
380

381
    if (Integer.parseInt(keyParts[1]) != positionEnd) {
1✔
NEW
382
      logger.warning(() -> String.format("positionEnd (%s) and key (%s) don't match", positionEndObject, key));
×
383
    }
384
  }
1✔
385

386
  /**
387
   * Retrieves the codes from the JSON object.
388
   * Codes are in format of am object "key": "value", where the key is the code and the value is the label.
389
   * Codes can also be in form of a codelist, which is a reference to a list of codes loaded from the codelists object
390
   * of the same schema.
391
   * @param codesHolder Meant to be either an indicator, a subfield or a position
392
   * @param objectKey The key of the codes object, "codes" or "codelist"
393
   * @return The list of codes for the respective codesHolder
394
   */
395
  private List<EncodedValue> getCodes(JSONObject codesHolder, String objectKey) {
396
    Object listValue = codesHolder.get(objectKey);
1✔
397
    if (listValue instanceof String) {
1✔
398
      return codeLists.computeIfAbsent((String) listValue, s -> new ArrayList<>());
1✔
399
    }
400

401
    JSONObject codes = (JSONObject) listValue;
1✔
402
    if (codes == null) {
1✔
403
      return new ArrayList<>();
1✔
404
    }
405
    return processCodes(codes);
1✔
406
  }
407

408
  private EncodedValue getPattern(JSONObject position) {
409
    String pattern = (String) position.get(PATTERN);
1✔
410
    if (pattern == null) {
1✔
411
      return null;
1✔
412
    }
413

414
    // Pattern is a regular expression, so we need to check if it is valid
415
    try {
416
      Pattern.compile(pattern);
1✔
NEW
417
    } catch (Exception e) {
×
NEW
418
      logger.warning(() -> "invalid pattern: " + pattern);
×
NEW
419
      return null;
×
420
    }
1✔
421

422
    EncodedValue codePattern = new EncodedValue(pattern, PATTERN);
1✔
423
    codePattern.setRegex(true);
1✔
424

425
    return codePattern;
1✔
426
  }
427

428
  private void assignGroupsToPattern(JSONObject position, EncodedValue codePattern) {
429
    JSONObject groups = (JSONObject) position.get("groups");
1✔
430
    if (groups == null) {
1✔
NEW
431
      return;
×
432
    }
433

434
    Map<Integer, String> regexGroups = new HashMap<>();
1✔
435
    for (Map.Entry<String, Object> groupEntry : groups.entrySet()) {
1✔
436
      try {
437
        int groupNumber = Integer.parseInt(groupEntry.getKey());
1✔
438
        // groupEntry.getValue() is an object containing the label of the group
439
        JSONObject groupBody = (JSONObject) groupEntry.getValue();
1✔
440
        String groupLabel = (String) groupBody.get(LABEL);
1✔
441
        regexGroups.put(groupNumber, groupLabel);
1✔
NEW
442
      } catch (NumberFormatException e) {
×
443
        // If the group number is not a number, then it is invalid and no group is added
NEW
444
        logger.warning(() -> "invalid group number: " + groupEntry.getKey());
×
NEW
445
        return;
×
446
      }
1✔
447
    }
1✔
448
    codePattern.setRegexGroups(regexGroups);
1✔
449
  }
1✔
450

451
  private List<EncodedValue> processCodes(JSONObject codes) {
452
    List<EncodedValue> encodedValues = new ArrayList<>();
1✔
453
    for (Map.Entry<String, Object> codeEntry : codes.entrySet()) {
1✔
454
      String code = codeEntry.getKey();
1✔
455
      String codeLabel = (String) codeEntry.getValue();
1✔
456

457
      if (code.startsWith("//")) {
1✔
NEW
458
        continue;
×
459
      }
460

461
      addCode(encodedValues, code, codeLabel);
1✔
462

463
      // Code ranges were abolished in favor of patterns
464
    }
1✔
465

466
    return encodedValues;
1✔
467
  }
468

469
  private void addCode(List<EncodedValue> encodedValues, String code, String codeLabel) {
470
    EncodedValue encodedValue = new EncodedValue(code, codeLabel);
1✔
471
    encodedValues.add(encodedValue);
1✔
472
  }
1✔
473
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc