• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pkiraly / metadata-qa-marc / #1632

02 Mar 2026 04:53PM UTC coverage: 90.198% (-0.08%) from 90.275%
#1632

push

pkiraly
compound index in Solr #740

36734 of 40726 relevant lines covered (90.2%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

40.13
/src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java
1
package de.gwdg.metadataqa.marc.cli;
2

3
import de.gwdg.metadataqa.marc.MarcSubfield;
4
import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters;
5
import de.gwdg.metadataqa.marc.cli.parameters.MarcToSolrParameters;
6
import de.gwdg.metadataqa.marc.cli.processor.BibliographicInputProcessor;
7
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
8
import de.gwdg.metadataqa.marc.dao.DataField;
9
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
10
import de.gwdg.metadataqa.marc.datastore.MarcSolrClient;
11
import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
12
import de.gwdg.metadataqa.marc.definition.general.indexer.FieldIndexer;
13
import de.gwdg.metadataqa.marc.model.validation.ValidationError;
14
import de.gwdg.metadataqa.marc.utils.Counter;
15
import de.gwdg.metadataqa.marc.utils.SchemaSpec;
16
import de.gwdg.metadataqa.marc.utils.keygenerator.DataFieldKeyGenerator;
17
import de.gwdg.metadataqa.marc.utils.marcspec.MarcSpecExtractor;
18
import de.gwdg.metadataqa.marc.utils.pica.PicaFieldDefinition;
19
import de.gwdg.metadataqa.marc.utils.pica.PicaGroupIndexer;
20
import de.gwdg.metadataqa.marc.utils.pica.path.PicaPath;
21
import org.apache.commons.cli.HelpFormatter;
22
import org.apache.commons.cli.Options;
23
import org.apache.commons.cli.ParseException;
24
import org.apache.commons.lang3.StringUtils;
25
import org.apache.solr.common.SolrDocument;
26
import org.apache.solr.common.SolrInputDocument;
27
import org.marc4j.marc.Record;
28

29
import java.io.IOException;
30
import java.io.Serializable;
31
import java.nio.file.Path;
32
import java.text.DecimalFormat;
33
import java.util.ArrayList;
34
import java.util.Collections;
35
import java.util.HashMap;
36
import java.util.HashSet;
37
import java.util.List;
38
import java.util.Map;
39
import java.util.Set;
40
import java.util.logging.Logger;
41

42
/**
43
 * usage:
44
 * java -cp target/qa-catalogue-0.1-SNAPSHOT-jar-with-dependencies.jar de.gwdg.metadataqa.marc.cli.MarcToSolr http://localhost:8983/solr/tardit 0001.0000000.formatted.json
45
 *
46
 */
47
public class MarcToSolr extends QACli<MarcToSolrParameters>
48
                        implements BibliographicInputProcessor, Serializable {
49

50
  private static final Logger logger = Logger.getLogger(
1✔
51
    MarcToSolr.class.getCanonicalName()
1✔
52
  );
53
  private Options options;
54
  private MarcSolrClient client;
55
  private MarcSolrClient validationClient;
56
  private Path currentFile;
57
  private boolean readyToProcess;
58
  private final DecimalFormat decimalFormat = new DecimalFormat();
1✔
59
  private FieldIndexer groupIndexer;
60
  private final Map<String, String> escapedTagCache = new HashMap<>();
1✔
61

62
  /**
   * Creates the processor from raw command line arguments.
   *
   * @param args the command line arguments, handed over to {@link MarcToSolrParameters}
   * @throws ParseException propagated from the construction of the parameters
   */
  public MarcToSolr(String[] args) throws ParseException {
    this.parameters = new MarcToSolrParameters(args);
    this.initialize();
  }
×
66

67
  /**
   * Creates the processor from an already built parameter object.
   *
   * @param parameters the parameters that configure the Solr clients and the indexing
   */
  public MarcToSolr(MarcToSolrParameters parameters) {
    this.parameters = parameters;
    this.initialize();
  }
1✔
71

72
  private void initialize() {
73
    options = parameters.getOptions();
1✔
74

75
    client = parameters.isUseEmbedded()
1✔
76
      ? new MarcSolrClient(parameters.getMainClient())
1✔
77
      : new MarcSolrClient(parameters.getSolrUrl());
1✔
78
    client.setTrimId(parameters.getTrimId());
1✔
79
    client.indexWithTokenizedField(parameters.isIndexWithTokenizedField());
1✔
80

81
    if (parameters.getFieldPrefix() != null) {
1✔
82
      client.setFieldPrefix(parameters.getFieldPrefix());
1✔
83
    }
84

85
    if (parameters.getSolrForScoresUrl() != null) {
1✔
86
      validationClient = parameters.isUseEmbedded()
1✔
87
        ? new MarcSolrClient(parameters.getValidationClient())
1✔
88
        : new MarcSolrClient(parameters.getSolrForScoresUrl());
1✔
89
      validationClient.setTrimId(parameters.getTrimId());
1✔
90

91
      if (parameters.getFieldPrefix() != null) {
1✔
92
        validationClient.setFieldPrefix(parameters.getFieldPrefix());
1✔
93
      }
94
    }
95

96
    readyToProcess = true;
1✔
97
    initializeGroups(parameters.getGroupBy(), parameters.isPica());
1✔
98
    if (doGroups()) {
1✔
99
      groupIndexer = new PicaGroupIndexer().setPicaPath((PicaPath) groupBy);
×
100
    }
101
  }
1✔
102

103
  public static void main(String[] args) {
104
    try {
105
      MarcToSolr processor = new MarcToSolr(args);
×
106
      if (StringUtils.isBlank(((MarcToSolrParameters) processor.getParameters()).getSolrUrl())) {
×
107
        logger.severe("Please provide a Solr URL and file name!");
×
108
        System.exit(1);
×
109
      }
110

111
      RecordIterator iterator = new RecordIterator(processor);
×
112
      iterator.setProcessWithErrors(processor.getParameters().getProcessRecordsWithoutId());
×
113
      iterator.start();
×
114
      System.exit(0);
×
115
    } catch(Exception e) {
×
116
      logger.severe(() -> "ERROR. " + e.getLocalizedMessage());
×
117
      System.exit(1);
×
118
    }
×
119
  }
×
120

121
  @Override
122
  public CommonParameters getParameters() {
123
    return parameters;
1✔
124
  }
125

126
  @Override
127
  public void processRecord(Record marc4jRecord, int recordNumber) throws IOException {
128
    // do nothing
129
  }
1✔
130

131
  @Override
132
  public void processRecord(BibliographicRecord bibliographicRecord, int recordNumber, List<ValidationError> errors) throws IOException {
133
    processRecord(bibliographicRecord, recordNumber);
×
134
  }
×
135

136
  @Override
137
  public void processRecord(BibliographicRecord bibliographicRecord, int recordNumber) throws IOException {
138
    if (parameters.getRecordIgnorator().isIgnorable(bibliographicRecord))
1✔
139
      return;
×
140

141
    if (bibliographicRecord.getSchemaType().equals(SchemaType.PICA) && doGroups()) {
1✔
142
      for (DataField groupField : bibliographicRecord.getDatafieldsByTag(((PicaPath) groupBy).getTag())) {
×
143
        groupField.addFieldIndexer(groupIndexer);
×
144
      }
×
145
    }
146

147
    Map<String, List<String>> keyValuePairs = bibliographicRecord.getKeyValuePairs(
1✔
148
      parameters.getSolrFieldType(), true, parameters.getMarcVersion()
1✔
149
    );
150

151
    // Add the record itself as a field to the index
152
    keyValuePairs.put("record_sni", Collections.singletonList(bibliographicRecord.asJson()));
1✔
153

154
    // logger.info(bibliographicRecord.getId());
155
    SolrInputDocument solrDocument = client.createSolrDoc(bibliographicRecord.getId(), keyValuePairs);
1✔
156
    if (validationClient != null) {
1✔
157
      indexValidationResults(bibliographicRecord, solrDocument);
1✔
158
    }
159

160
    if (parameters.isIndexFieldCounts() || parameters.isIndexSubfieldCounts()) {
1✔
161
      indexFieldCounts(bibliographicRecord, solrDocument);
×
162
    }
163

164

165
    if (parameters.getCompoundFields() != null) {
1✔
166
      indexCompoundFields(bibliographicRecord, solrDocument);
×
167
    } else {
168

169
    }
170

171
    try {
172
      client.index(solrDocument);
1✔
173
    } catch (Exception e) {
×
174
      logger.severe(() -> "ERROR while index." + e.getLocalizedMessage());
×
175
    }
1✔
176

177
    if (recordNumber % parameters.getCommitAt() != 0) {
1✔
178
      return;
1✔
179
    }
180

181
    if (parameters.isDoCommit()) {
×
182
      logger.info("do commit @" + recordNumber);
×
183
      client.commit();
×
184
      long indexedRecordCount = client.getCount();
×
185
      if (recordNumber != indexedRecordCount) {
×
186
        logger.severe(String.format("recordNumber: %d != indexedRecordCount: %d", recordNumber, indexedRecordCount));
×
187
      }
188
      logger.info("/do commit @" + recordNumber);
×
189
    }
190

191
    String logMessage = String.format(
×
192
      "%s/%s (%s)",
193
      currentFile.getFileName().toString(),
×
194
      decimalFormat.format(recordNumber),
×
195
      bibliographicRecord.getId()
×
196
    );
197
    logger.info(logMessage);
×
198
  }
×
199

200
  private void indexCompoundFields(BibliographicRecord bibliographicRecord,
201
                                   SolrInputDocument solrDocument) {
202
    for (Map.Entry<String, List<SchemaSpec>> compoundEntry : parameters.getCompoundFields().entrySet()) {
×
203
      String solrField = String.format("%s%s_ss", parameters.getFieldPrefix(), compoundEntry.getKey());
×
204
      Set<String> allValues = new HashSet<>();
×
205
      for (SchemaSpec bibliographicField : compoundEntry.getValue()) {
×
206
        List<String> values = bibliographicRecord.select(bibliographicField);
×
207
        if (!values.isEmpty()) {
×
208
          // logger.info(solrField + " -> " + values.getClass());
209
          allValues.addAll(values);
×
210
        }
211
      }
×
212
      solrDocument.addField(solrField, allValues);
×
213
    }
×
214
  }
×
215

216
  private void indexValidationResults(BibliographicRecord bibliographicRecord,
217
                                      SolrInputDocument document) {
218
    SolrDocument validationValues = validationClient.get(bibliographicRecord.getId());
1✔
219
    if (validationValues == null || validationValues.isEmpty()) {
1✔
220
      return;
×
221
    }
222

223
    for (String field : validationValues.getFieldNames()) {
1✔
224
      document.addField(field, validationValues.getFieldValues(field));
1✔
225
    }
1✔
226
  }
1✔
227

228
  /**
229
   * Index field and subfield counts. The solr field will look like <tag>_count_i and
230
   * <tag><subfield code>_count_i, the value will be the number of times this element is
231
   * available in the record.
232
   *
233
   * @param bibliographicRecord The bibliographic record
234
   * @param document The Solr document
235
   */
236
  private void indexFieldCounts(BibliographicRecord bibliographicRecord,
237
                                SolrInputDocument document) {
238
    Counter<String> counter = new Counter<>();
×
239
    boolean isPica = bibliographicRecord.getSchemaType().equals(SchemaType.PICA);
×
240
    Map<String, List<Integer>> subfields = new HashMap<>();
×
241
    for (DataField field : bibliographicRecord.getDatafields()) {
×
242
      String tag;
243
      if (field.getDefinition() != null) {
×
244
        tag = isPica
×
245
          ? ((PicaFieldDefinition)field.getDefinition()).getId()
×
246
          : field.getDefinition().getTag();
×
247
      } else {
248
        tag = field.getTag();
×
249
      }
250
      String safeTag = escape(tag);
×
251
      if (parameters.isIndexFieldCounts())
×
252
        counter.count(safeTag);
×
253

254
      if (parameters.isIndexSubfieldCounts()) {
×
255
        Counter<String> subfieldCounter = new Counter<>();
×
256
        for (MarcSubfield subfield : field.getSubfields()) {
×
257
          String safeSubfieldCode = DataFieldKeyGenerator.escape(subfield.getCode());
×
258
          subfieldCounter.count(safeTag + safeSubfieldCode);
×
259
        }
×
260
        for (Map.Entry<String, Integer> entry : subfieldCounter.entrySet()) {
×
261
          if (!subfields.containsKey(entry.getKey()))
×
262
            subfields.put(entry.getKey(), new ArrayList<>());
×
263
          subfields.get(entry.getKey()).add(entry.getValue());
×
264
        }
×
265
      }
266
    }
×
267
    for (Map.Entry<String, Integer> entry : counter.entrySet()) {
×
268
      document.addField(String.format(
×
269
        "%s%s_count_i",
270
        parameters.getFieldPrefix(), entry.getKey()), entry.getValue());
×
271
    }
×
272

273
    if (parameters.isIndexSubfieldCounts()) {
×
274
      for (Map.Entry<String, List<Integer>> entry : subfields.entrySet()) {
×
275
        document.addField(String.format(
×
276
          "%s%s_count_is",
277
          parameters.getFieldPrefix(), entry.getKey()), entry.getValue());
×
278
      }
×
279
    }
280
  }
×
281

282
  private String escape(String tag) {
283
    escapedTagCache.putIfAbsent(tag, DataFieldKeyGenerator.escape(tag));
×
284
    return escapedTagCache.get(tag);
×
285
  }
286

287
  @Override
288
  public void beforeIteration() {
289
    logger.info(() -> parameters.formatParameters());
1✔
290
    parameters.setMainClient(null);
1✔
291
    parameters.setValidationClient(null);
1✔
292
  }
1✔
293

294
  @Override
295
  public void fileOpened(Path path) {
296
    currentFile = path;
1✔
297
  }
1✔
298

299
  @Override
300
  public void fileProcessed() {
301
    // Do nothing
302
  }
×
303

304
  @Override
305
  public void afterIteration(int numberOfprocessedRecords, long duration) {
306
    client.commit();
1✔
307
    logger.info(parameters.toString());
1✔
308
    saveParameters(
1✔
309
      "marctosolr.params.json",
310
      parameters,
311
      Map.of(
1✔
312
        "numberOfprocessedRecords", numberOfprocessedRecords,
1✔
313
        "duration", duration
1✔
314
      )
315
    );
316
  }
1✔
317

318
  @Override
319
  public void printHelp(Options options) {
320
    HelpFormatter formatter = new HelpFormatter();
×
321
    String message = String.format("java -cp qa-catalogue.jar %s [options] [file]", this.getClass().getCanonicalName());
×
322
    formatter.printHelp(message, options);
×
323
  }
×
324

325
  @Override
326
  public boolean readyToProcess() {
327
    return readyToProcess;
1✔
328
  }
329
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc