IQSS / dataverse / build #24060

16 Dec 2024 01:30PM UTC. Coverage: 22.573% (+0.004%) from 22.569%.
Build #24060 (push, via GitHub), by stevenwinship:
Merge branch 'develop' into 10714-access-requests-missing-since-upgrade-v6-0

2 of 18 new or added lines in 5 files covered (11.11%).
3 existing lines in 1 file now uncovered.
19544 of 86582 relevant lines covered (22.57%).
0.23 hits per line.

Source File (file coverage: 16.35%):
/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java
package edu.harvard.iq.dataverse.search;

import edu.harvard.iq.dataverse.*;
import edu.harvard.iq.dataverse.DatasetVersion.VersionState;
import edu.harvard.iq.dataverse.DvObject.DType;
import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean;
import edu.harvard.iq.dataverse.authorization.providers.builtin.BuiltinUserServiceBean;
import edu.harvard.iq.dataverse.batch.util.LoggingUtil;
import edu.harvard.iq.dataverse.dataaccess.DataAccess;
import edu.harvard.iq.dataverse.dataaccess.DataAccessRequest;
import edu.harvard.iq.dataverse.dataaccess.StorageIO;
import edu.harvard.iq.dataverse.dataset.DatasetType;
import edu.harvard.iq.dataverse.datavariable.DataVariable;
import edu.harvard.iq.dataverse.datavariable.VariableMetadata;
import edu.harvard.iq.dataverse.datavariable.VariableMetadataUtil;
import edu.harvard.iq.dataverse.datavariable.VariableServiceBean;
import edu.harvard.iq.dataverse.harvest.client.HarvestingClient;
import edu.harvard.iq.dataverse.search.IndexableDataset.DatasetState;
import edu.harvard.iq.dataverse.settings.FeatureFlags;
import edu.harvard.iq.dataverse.settings.JvmSettings;
import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
import edu.harvard.iq.dataverse.util.FileUtil;
import edu.harvard.iq.dataverse.util.StringUtil;
import edu.harvard.iq.dataverse.util.SystemConfig;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Future;
import java.util.concurrent.Semaphore;
import java.util.function.Function;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import jakarta.annotation.PostConstruct;
import jakarta.annotation.PreDestroy;
import jakarta.ejb.AsyncResult;
import jakarta.ejb.Asynchronous;
import jakarta.ejb.EJB;
import jakarta.ejb.EJBException;
import jakarta.ejb.Stateless;
import jakarta.ejb.TransactionAttribute;
import static jakarta.ejb.TransactionAttributeType.REQUIRES_NEW;

import jakarta.inject.Inject;
import jakarta.inject.Named;
import jakarta.json.JsonObject;
import jakarta.persistence.EntityManager;
import jakarta.persistence.PersistenceContext;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CursorMarkParams;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.eclipse.microprofile.config.Config;
import org.eclipse.microprofile.config.ConfigProvider;
import org.eclipse.microprofile.metrics.MetricUnits;
import org.eclipse.microprofile.metrics.Timer;
import org.eclipse.microprofile.metrics.annotation.Metric;
import org.xml.sax.ContentHandler;

@Stateless
@Named
public class IndexServiceBean {

    private static final Logger logger = Logger.getLogger(IndexServiceBean.class.getCanonicalName());
    private static final Config config = ConfigProvider.getConfig();

    @PersistenceContext(unitName = "VDCNet-ejbPU")
    private EntityManager em;

    @EJB
    DvObjectServiceBean dvObjectService;
    @EJB
    DataverseServiceBean dataverseService;
    @EJB
    DatasetServiceBean datasetService;
    @EJB
    DatasetVersionServiceBean datasetVersionService;
    @EJB
    BuiltinUserServiceBean dataverseUserServiceBean;
    @EJB
    PermissionServiceBean permissionService;
    @EJB
    AuthenticationServiceBean userServiceBean;
    @EJB
    SystemConfig systemConfig;
    @EJB
    SearchPermissionsServiceBean searchPermissionsService;
    @EJB
    SolrIndexServiceBean solrIndexService;
    @EJB
    DatasetLinkingServiceBean dsLinkingService;
    @EJB
    DataverseLinkingServiceBean dvLinkingService;
    @EJB
    SettingsServiceBean settingsService;
    @EJB
    SolrClientService solrClientService;
    @EJB
    DataFileServiceBean dataFileService;

    @EJB
    VariableServiceBean variableService;

    @EJB
    IndexBatchServiceBean indexBatchService;

    @EJB
    DatasetFieldServiceBean datasetFieldService;

    @Inject
    DatasetVersionFilesServiceBean datasetVersionFilesServiceBean;

    public static final String solrDocIdentifierDataverse = "dataverse_";
    public static final String solrDocIdentifierFile = "datafile_";
    public static final String solrDocIdentifierDataset = "dataset_";
    public static final String draftSuffix = "_draft";
    public static final String deaccessionedSuffix = "_deaccessioned";
    public static final String discoverabilityPermissionSuffix = "_permission";
    private static final String groupPrefix = "group_";
    private static final String groupPerUserPrefix = "group_user";
    private static final String publicGroupIdString = "public";
    private static final String publicGroupString = groupPrefix + "public";
    public static final String PUBLISHED_STRING = "Published";
    private static final String UNPUBLISHED_STRING = "Unpublished";
    private static final String DRAFT_STRING = "Draft";
    private static final String IN_REVIEW_STRING = "In Review";
    private static final String DEACCESSIONED_STRING = "Deaccessioned";
    public static final String HARVESTED = "Harvested";
    private String rootDataverseName;
    private Dataverse rootDataverseCached;
    SolrClient solrServer;

    private VariableMetadataUtil variableMetadataUtil;

    @PostConstruct
    public void init() {
        // Get from MPCONFIG. Might be configured by a sysadmin or simply return the default shipped with
        // resources/META-INF/microprofile-config.properties.
        String protocol = JvmSettings.SOLR_PROT.lookup();
        String path = JvmSettings.SOLR_PATH.lookup();

        String urlString = protocol + "://" + systemConfig.getSolrHostColonPort() + path;
        solrServer = new HttpSolrClient.Builder(urlString).build();

        rootDataverseName = findRootDataverseCached().getName();
    }
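    // For illustration only (hypothetical values): with protocol "http", host:port
    // "localhost:8983", and path "/solr", the resulting Solr client URL would be
    // "http://localhost:8983/solr".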

    @PreDestroy
    public void close() {
        if (solrServer != null) {
            try {
                solrServer.close();
            } catch (IOException e) {
                logger.warning("Solr closing error: " + e);
            }
            solrServer = null;
        }
    }

    @TransactionAttribute(REQUIRES_NEW)
    public Future<String> indexDataverseInNewTransaction(Dataverse dataverse) throws SolrServerException, IOException {
        return indexDataverse(dataverse, false);
    }

    public Future<String> indexDataverse(Dataverse dataverse) throws SolrServerException, IOException {
        return indexDataverse(dataverse, true);
    }

    public Future<String> indexDataverse(Dataverse dataverse, boolean processPaths) throws SolrServerException, IOException {
        logger.fine("indexDataverse called on dataverse id " + dataverse.getId() + " (" + dataverse.getAlias() + ")");
        if (dataverse.getId() == null) {
            // TODO: Investigate the root cause of this "unable to index dataverse"
            // error showing up in the logs. Try running the API test suite?
            String msg = "unable to index dataverse. id was null (alias: " + dataverse.getAlias() + ")";
            logger.info(msg);
            return new AsyncResult<>(msg);
        }
        Dataverse rootDataverse = findRootDataverseCached();
        if (rootDataverse == null) {
            String msg = "Could not find root dataverse and the root dataverse should not be indexed. Returning.";
            return new AsyncResult<>(msg);
        } else if (dataverse.getId().equals(rootDataverse.getId())) {
            String msg = "The root dataverse should not be indexed. Returning.";
            return new AsyncResult<>(msg);
        }
        Collection<SolrInputDocument> docs = new ArrayList<>();
        SolrInputDocument solrInputDocument = new SolrInputDocument();
        solrInputDocument.addField(SearchFields.ID, solrDocIdentifierDataverse + dataverse.getId());
        solrInputDocument.addField(SearchFields.ENTITY_ID, dataverse.getId());
        solrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, systemConfig.getVersion());
        solrInputDocument.addField(SearchFields.IDENTIFIER, dataverse.getAlias());
        solrInputDocument.addField(SearchFields.TYPE, "dataverses");
        solrInputDocument.addField(SearchFields.NAME, dataverse.getName());
        solrInputDocument.addField(SearchFields.NAME_SORT, dataverse.getName());
        solrInputDocument.addField(SearchFields.DATAVERSE_NAME, dataverse.getName());
        solrInputDocument.addField(SearchFields.DATAVERSE_ALIAS, dataverse.getAlias());
        solrInputDocument.addField(SearchFields.DATAVERSE_CATEGORY, dataverse.getIndexableCategoryName());
        if (dataverse.isReleased()) {
            solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING);
            if (FeatureFlags.ADD_PUBLICOBJECT_SOLR_FIELD.enabled()) {
                solrInputDocument.addField(SearchFields.PUBLIC_OBJECT, true);
            }
            solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, dataverse.getPublicationDate());
        } else {
            solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING);
            solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, dataverse.getCreateDate());
        }
        /* We don't really have harvested dataverses yet;
           (I have in fact just removed the isHarvested() method from the Dataverse object) -- L.A.
        if (dataverse.isHarvested()) {
            solrInputDocument.addField(SearchFields.IS_HARVESTED, true);
            solrInputDocument.addField(SearchFields.SOURCE, HARVESTED);
        } else { (this means that all dataverses are "local" - should this be removed?) */
        solrInputDocument.addField(SearchFields.IS_HARVESTED, false);
        solrInputDocument.addField(SearchFields.METADATA_SOURCE, rootDataverse.getName()); // rootDataverseName
        /*}*/

        addDataverseReleaseDateToSolrDoc(solrInputDocument, dataverse);
        // if (dataverse.getOwner() != null) {
        //     solrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataverse.getOwner().getName());
        // }
        solrInputDocument.addField(SearchFields.DESCRIPTION, StringUtil.html2text(dataverse.getDescription()));
        solrInputDocument.addField(SearchFields.DATAVERSE_DESCRIPTION, StringUtil.html2text(dataverse.getDescription()));
        // logger.info("dataverse affiliation: " + dataverse.getAffiliation());
        if (dataverse.getAffiliation() != null && !dataverse.getAffiliation().isEmpty()) {
            /**
             * @todo: stop using affiliation as category
             */
            // solrInputDocument.addField(SearchFields.CATEGORY, dataverse.getAffiliation());
            solrInputDocument.addField(SearchFields.AFFILIATION, dataverse.getAffiliation());
            solrInputDocument.addField(SearchFields.DATAVERSE_AFFILIATION, dataverse.getAffiliation());
        }
        Set<String> langs = settingsService.getConfiguredLanguages();
        for (ControlledVocabularyValue dataverseSubject : dataverse.getDataverseSubjects()) {
            String subject = dataverseSubject.getStrValue();
            if (!subject.equals(DatasetField.NA_VALUE)) {
                // Index in all used languages (display and metadata languages)
                for (String locale : langs) {
                    solrInputDocument.addField(SearchFields.DATAVERSE_SUBJECT, dataverseSubject.getLocaleStrValue(locale));
                }
                if (langs.isEmpty()) {
                    solrInputDocument.addField(SearchFields.DATAVERSE_SUBJECT, dataverseSubject.getStrValue());
                }

                // collapse into the shared "subject" field used as a facet
                solrInputDocument.addField(SearchFields.SUBJECT, subject);
            }
        }
        // checking for NPE is important so we can create the root dataverse
        if (rootDataverse != null && !dataverse.equals(rootDataverse)) {
            // important when creating the root dataverse
            if (dataverse.getOwner() != null) {
                solrInputDocument.addField(SearchFields.PARENT_ID, dataverse.getOwner().getId());
                solrInputDocument.addField(SearchFields.PARENT_NAME, dataverse.getOwner().getName());
                solrInputDocument.addField(SearchFields.DATAVERSE_PARENT_ALIAS, dataverse.getOwner().getAlias());
            }
        }
        List<String> dataversePathSegmentsAccumulator = new ArrayList<>();
        List<String> dataverseSegments = findPathSegments(dataverse, dataversePathSegmentsAccumulator);
        List<String> dataversePaths = getDataversePathsFromSegments(dataverseSegments);
        if (dataversePaths.size() > 0) {
            // removing the dataverse's own id from the paths
            dataversePaths.remove(dataversePaths.size() - 1);
        }
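        // Illustrative sketch (hypothetical ids): for a dataverse whose ancestry is
        // root -> 5 -> 12, the cumulative path list built above would look like
        // ["/5", "/5/12"], and the trailing entry (the dataverse's own path) has
        // just been dropped so that only ancestor paths remain.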

        // Add paths for my linking dataverses
        List<Dataverse> linkingDataverses = findAllLinkingDataverses(dataverse);
        List<String> linkingDataversePaths = findLinkingDataversePaths(linkingDataverses);
        for (String dvPath : linkingDataversePaths) {
            dataversePaths.add(dvPath);
        }
        // only do this if we're indexing an individual dataverse, i.e. not a full re-index
        List<Long> dataverseChildrenIds = new ArrayList<>();
        List<Long> datasetChildrenIds = new ArrayList<>();
        if (processPaths) {
            // Get linking dataverses to see if I need to reindex my children
            if (hasAnyLinkingDataverses(dataverse)) {
                dataverseChildrenIds = dataverseService.findAllDataverseDataverseChildren(dataverse.getId());
                datasetChildrenIds = dataverseService.findAllDataverseDatasetChildren(dataverse.getId());
                for (Long id : datasetChildrenIds) {
                    updatePathForExistingSolrDocs(datasetService.find(id));
                }

                for (Long id : dataverseChildrenIds) {
                    updatePathForExistingSolrDocs(dataverseService.find(id));
                }
            }
        }

        solrInputDocument.addField(SearchFields.SUBTREE, dataversePaths);
        docs.add(solrInputDocument);

        String status;
        try {
            if (dataverse.getId() != null) {
                solrClientService.getSolrClient().add(docs);
            } else {
                logger.info("WARNING: indexing of a dataverse with no id attempted");
            }
        } catch (SolrServerException | IOException ex) {
            status = ex.toString();
            logger.info(status);
            return new AsyncResult<>(status);
        }
        dvObjectService.updateContentIndexTime(dataverse);
        IndexResponse indexResponse = solrIndexService.indexPermissionsForOneDvObject(dataverse);
        String msg = "indexed dataverse " + dataverse.getId() + ":" + dataverse.getAlias() + ". Response from permission indexing: " + indexResponse.getMessage();
        return new AsyncResult<>(msg);

    }

    @TransactionAttribute(REQUIRES_NEW)
    public void indexDatasetInNewTransaction(Long datasetId) { // was: (Dataset dataset)
        boolean doNormalSolrDocCleanUp = false;
        Dataset dataset = datasetService.findDeep(datasetId);
        asyncIndexDataset(dataset, doNormalSolrDocCleanUp);
        dataset = null;
    }

    // The following two variables are only used in the synchronized getNextToIndex method and do not need to be synchronized themselves

    // nextToIndex contains datasets mapped by dataset id that were added for future indexing while indexing was already ongoing for a given dataset
    // (if there already was a dataset scheduled for indexing, it is overwritten and only the most recently requested version is kept in the map)
    private static final Map<Long, Dataset> NEXT_TO_INDEX = new ConcurrentHashMap<>();
    // indexingNow is a set of dataset ids of datasets being indexed asynchronously right now
    private static final Map<Long, Boolean> INDEXING_NOW = new ConcurrentHashMap<>();
    // semaphore for async indexing
    private static final Semaphore ASYNC_INDEX_SEMAPHORE = new Semaphore(JvmSettings.MAX_ASYNC_INDEXES.lookupOptional(Integer.class).orElse(4), true);
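    // Note on the semaphore above: the permit count comes from
    // JvmSettings.MAX_ASYNC_INDEXES and falls back to 4 when unset; the "true"
    // argument requests a fair semaphore, so waiting indexing threads acquire
    // permits in FIFO order.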

    @Inject
    @Metric(name = "index_permit_wait_time", absolute = true, unit = MetricUnits.NANOSECONDS,
            description = "Displays how long it takes to receive a permit to index a dataset")
    Timer indexPermitWaitTimer;

    @Inject
    @Metric(name = "index_time", absolute = true, unit = MetricUnits.NANOSECONDS,
            description = "Displays how long it takes to index a dataset")
    Timer indexTimer;

    /**
     * Try to acquire a permit from the semaphore, avoiding too many parallel
     * indexing jobs that could potentially overwhelm Solr. This method also
     * times the wait for the permit, allowing indexing performance to be measured.
     * @throws InterruptedException
     */
    private void acquirePermitFromSemaphore() throws InterruptedException {
        try (var timeContext = indexPermitWaitTimer.time()) {
            ASYNC_INDEX_SEMAPHORE.acquire();
        }
    }

    // When you pass null as the Dataset parameter to this method, it indicates that the indexing of the dataset with "id" has finished
    // Pass a non-null Dataset to schedule it for indexing
    synchronized private static Dataset getNextToIndex(Long id, Dataset d) {
        if (d == null) { // -> indexing of the dataset with id has finished
            Dataset next = NEXT_TO_INDEX.remove(id);
            if (next == null) { // -> no new indexing jobs were requested while indexing was ongoing
                // the job can be stopped now
                INDEXING_NOW.remove(id);
            }
            return next;
        }
        // an index job is requested for a non-null dataset
        if (INDEXING_NOW.containsKey(id)) { // -> an indexing job is already ongoing, and a new job should not be started by the current thread -> return null
            NEXT_TO_INDEX.put(id, d);
            return null;
        }
        // otherwise, start a new job
        INDEXING_NOW.put(id, true);
        return d;
    }
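    // Worked example (hypothetical dataset id 42; A, B, C are successive edits):
    //   thread 1: getNextToIndex(42, A)    -> A    (42 marked as indexing now)
    //   thread 2: getNextToIndex(42, B)    -> null (B parked in NEXT_TO_INDEX)
    //   thread 3: getNextToIndex(42, C)    -> null (C replaces B; B is skipped)
    //   thread 1: getNextToIndex(42, null) -> C    (picks up the most recent request)
    //   thread 1: getNextToIndex(42, null) -> null (nothing pending; 42 unmarked)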

    /**
     * Indexes a dataset asynchronously.
     *
     * Note that this method implements a synchronized skipping mechanism. When an
     * indexing job is already running for a given dataset in the background, the
     * new call will not index that dataset, but will delegate the execution to
     * the already running job. The running job will pick up the requested indexing
     * once it is finished with the ongoing indexing. If another indexing is
     * requested before the ongoing indexing is finished, only the most recently
     * requested indexing will be picked up for the next indexing.
     *
     * In other words: we can have at most one indexing ongoing for the given
     * dataset, and at most one (most recent) request for reindexing of the same
     * dataset. All requests that come between the most recent one and the ongoing
     * one are skipped for optimization reasons. For a more in-depth discussion,
     * see the pull request: https://github.com/IQSS/dataverse/pull/9558
     *
     * @param dataset                The dataset to be indexed.
     * @param doNormalSolrDocCleanUp Flag for normal Solr doc clean up.
     */
    @Asynchronous
    public void asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) {
        try {
            acquirePermitFromSemaphore();
            doAsyncIndexDataset(dataset, doNormalSolrDocCleanUp);
        } catch (InterruptedException e) {
            String failureLogText = "Indexing failed: interrupted. You can kick off a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + dataset.getId().toString();
            failureLogText += "\r\n" + e.getLocalizedMessage();
            LoggingUtil.writeOnSuccessFailureLog(null, failureLogText, dataset);
        } finally {
            ASYNC_INDEX_SEMAPHORE.release();
        }
    }
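    // Typical call site, as a sketch ("indexService" is a hypothetical injected
    // reference to this bean):
    //   indexService.asyncIndexDataset(dataset, true);
    // The container returns immediately and runs the indexing on the @Asynchronous
    // EJB thread pool, throttled by ASYNC_INDEX_SEMAPHORE.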

    private void doAsyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) {
        Long id = dataset.getId();
        Dataset next = getNextToIndex(id, dataset); // if there is an ongoing index job for this dataset, next is null (the ongoing job will reindex the newest version after the current indexing finishes)
        while (next != null) {
            // The time context will automatically start on creation and stop when leaving the try block
            try (var timeContext = indexTimer.time()) {
                indexDataset(next, doNormalSolrDocCleanUp);
            } catch (Exception e) { // catch all possible exceptions; otherwise, when something unexpected happens, the dataset would remain locked and impossible to reindex
                String failureLogText = "Indexing failed. You can kick off a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + dataset.getId().toString();
                failureLogText += "\r\n" + e.getLocalizedMessage();
                LoggingUtil.writeOnSuccessFailureLog(null, failureLogText, dataset);
            }
            next = getNextToIndex(id, null); // if the dataset was not changed during the indexing (and no new job was requested), next is null and the loop can be stopped
        }
    }

    @Asynchronous
    public void asyncIndexDatasetList(List<Dataset> datasets, boolean doNormalSolrDocCleanUp) {
        for (Dataset dataset : datasets) {
            try {
                acquirePermitFromSemaphore();
                doAsyncIndexDataset(dataset, true);
            } catch (InterruptedException e) {
                String failureLogText = "Indexing failed: interrupted. You can kick off a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + dataset.getId().toString();
                failureLogText += "\r\n" + e.getLocalizedMessage();
                LoggingUtil.writeOnSuccessFailureLog(null, failureLogText, dataset);
            } finally {
                ASYNC_INDEX_SEMAPHORE.release();
            }
        }
    }

    public void indexDvObject(DvObject objectIn) throws SolrServerException, IOException {
        if (objectIn.isInstanceofDataset()) {
            asyncIndexDataset((Dataset) objectIn, true);
        } else if (objectIn.isInstanceofDataverse()) {
            indexDataverse((Dataverse) objectIn);
        }
    }

    public void indexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) throws SolrServerException, IOException {
        doIndexDataset(dataset, doNormalSolrDocCleanUp);
        updateLastIndexedTime(dataset.getId());
    }

    private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) throws SolrServerException, IOException {
        logger.fine("indexing dataset " + dataset.getId());
        /**
         * @todo should we use solrDocIdentifierDataset or
         * IndexableObject.IndexableTypes.DATASET.getName() + "_" ?
         */
        String solrIdPublished = determinePublishedDatasetSolrDocId(dataset);
        String solrIdDraftDataset = IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.WORKING_COPY.getSuffix();
        String solrIdDeaccessioned = determineDeaccessionedDatasetId(dataset);
        StringBuilder debug = new StringBuilder();
        debug.append("\ndebug:\n");
        boolean reduceSolrDeletes = FeatureFlags.REDUCE_SOLR_DELETES.enabled();
        if (!reduceSolrDeletes) {
            int numPublishedVersions = 0;
            List<DatasetVersion> versions = dataset.getVersions();
            List<String> solrIdsOfFilesToDelete = new ArrayList<>();
            for (DatasetVersion datasetVersion : versions) {
                Long versionDatabaseId = datasetVersion.getId();
                String versionTitle = datasetVersion.getTitle();
                String semanticVersion = datasetVersion.getSemanticVersion();
                DatasetVersion.VersionState versionState = datasetVersion.getVersionState();
                if (versionState.equals(DatasetVersion.VersionState.RELEASED)) {
                    numPublishedVersions += 1;
                }
                debug.append("version found with database id " + versionDatabaseId + "\n");
                debug.append("- title: " + versionTitle + "\n");
                debug.append("- semanticVersion-VersionState: " + semanticVersion + "-" + versionState + "\n");
                List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
                List<String> fileInfo = new ArrayList<>();
                for (FileMetadata fileMetadata : fileMetadatas) {
                    String solrIdOfPublishedFile = solrDocIdentifierFile + fileMetadata.getDataFile().getId();
                    /**
                     * It sounds weird, but the first thing we'll do is preemptively
                     * delete the Solr documents of all published files. Don't
                     * worry, published files will be re-indexed later along with
                     * the dataset. We do this so users can delete files from
                     * published versions of datasets and then re-publish a new
                     * version without fear that their old published files (now
                     * deleted from the latest published version) will be
                     * searchable. See also
                     * https://github.com/IQSS/dataverse/issues/762
                     */
                    solrIdsOfFilesToDelete.add(solrIdOfPublishedFile);
                    fileInfo.add(fileMetadata.getDataFile().getId() + ":" + fileMetadata.getLabel());
                }
                try {
                    /**
                     * Preemptively delete *all* Solr documents for files associated
                     * with the dataset based on a Solr query.
                     *
                     * We must query Solr for this information because the file has
                     * been deleted from the database (perhaps when Solr was down,
                     * as reported in https://github.com/IQSS/dataverse/issues/2086),
                     * so the database doesn't even know about the file. It's an
                     * orphan.
                     *
                     * @todo This Solr query should make the iteration above based
                     * on the database unnecessary because the Solr query should
                     * find all files for the dataset. We can probably remove the
                     * iteration above after an "index all" has been performed.
                     * Without an "index all" we won't be able to find files based
                     * on parentId because that field wasn't searchable in 4.0.
                     *
                     * @todo We should also delete the corresponding Solr
                     * "permission" documents for the files.
                     */
                    List<String> allFilesForDataset = findFilesOfParentDataset(dataset.getId());
                    solrIdsOfFilesToDelete.addAll(allFilesForDataset);
                } catch (SearchException | NullPointerException ex) {
                    logger.fine("could not run search of files to delete: " + ex);
                }
                int numFiles = 0;
                if (fileMetadatas != null) {
                    numFiles = fileMetadatas.size();
                }
                debug.append("- files: " + numFiles + " " + fileInfo.toString() + "\n");
            }
            debug.append("numPublishedVersions: " + numPublishedVersions + "\n");
            if (doNormalSolrDocCleanUp) {
                IndexResponse resultOfAttemptToPreemptivelyDeletePublishedFiles = solrIndexService.deleteMultipleSolrIds(solrIdsOfFilesToDelete);
                debug.append("result of attempt to preemptively delete published files before reindexing: " + resultOfAttemptToPreemptivelyDeletePublishedFiles + "\n");
            }
        }
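        // Example of what the legacy (non-reduceSolrDeletes) path above collects,
        // with hypothetical ids: a dataset with published files 18 and 19 yields
        // "datafile_18" and "datafile_19" (plus any orphaned file docs found via
        // the Solr query), all deleted in one batch before the dataset is reindexed.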
        DatasetVersion latestVersion = dataset.getLatestVersion();
        DatasetVersion.VersionState latestVersionState = latestVersion.getVersionState();
        String latestVersionStateString = latestVersionState.name();
        DatasetVersion releasedVersion = dataset.getReleasedVersion();
        boolean atLeastOnePublishedVersion = false;
        if (releasedVersion != null) {
            atLeastOnePublishedVersion = true;
        }
        if (reduceSolrDeletes) {
            List<String> solrIdsOfDocsToDelete = null;
            if (logger.isLoggable(Level.FINE)) {
                writeDebugInfo(debug, dataset);
            }
            if (doNormalSolrDocCleanUp) {
                try {
                    solrIdsOfDocsToDelete = findFilesOfParentDataset(dataset.getId());
                    logger.fine("Existing file docs: " + String.join(", ", solrIdsOfDocsToDelete));
                    if (!solrIdsOfDocsToDelete.isEmpty()) {
                        // We keep the latest version's docs unless it is deaccessioned and there is no
                        // published/released version,
                        // so skip the loop removing those docs from the delete list except in that case
                        if ((!latestVersion.isDeaccessioned() || atLeastOnePublishedVersion)) {
                            List<FileMetadata> latestFileMetadatas = latestVersion.getFileMetadatas();
                            String suffix = (new IndexableDataset(latestVersion)).getDatasetState().getSuffix();
                            for (FileMetadata fileMetadata : latestFileMetadatas) {
                                String solrIdOfPublishedFile = solrDocIdentifierFile
                                        + fileMetadata.getDataFile().getId() + suffix;
                                solrIdsOfDocsToDelete.remove(solrIdOfPublishedFile);
                            }
                        }
                        if (releasedVersion != null && !releasedVersion.equals(latestVersion)) {
                            List<FileMetadata> releasedFileMetadatas = releasedVersion.getFileMetadatas();
                            for (FileMetadata fileMetadata : releasedFileMetadatas) {
                                String solrIdOfPublishedFile = solrDocIdentifierFile
                                        + fileMetadata.getDataFile().getId();
                                solrIdsOfDocsToDelete.remove(solrIdOfPublishedFile);
                            }
                        }
                    }
                    // Clear any unused dataset docs
                    if (!latestVersion.isDraft()) {
                        // The latest version is released, so we should delete any draft docs for the
                        // dataset
                        solrIdsOfDocsToDelete.add(solrIdDraftDataset);
                    }
                    if (!atLeastOnePublishedVersion) {
                        // There's no released version, so we should delete any normal-state docs for the
                        // dataset
                        solrIdsOfDocsToDelete.add(solrIdPublished);
                    }
                    if (atLeastOnePublishedVersion || !latestVersion.isDeaccessioned()) {
                        // There's a released version or a draft, so we should delete any
                        // deaccessioned-state docs for the dataset
                        solrIdsOfDocsToDelete.add(solrIdDeaccessioned);
                    }
                } catch (SearchException | NullPointerException ex) {
                    logger.fine("could not run search of files to delete: " + ex);
                }
                logger.fine("Solr docs to delete: " + String.join(", ", solrIdsOfDocsToDelete));

                if (!solrIdsOfDocsToDelete.isEmpty()) {
                    List<String> solrIdsOfPermissionDocsToDelete = new ArrayList<>();
                    for (String file : solrIdsOfDocsToDelete) {
                        // Also remove associated permission docs
                        solrIdsOfPermissionDocsToDelete.add(file + discoverabilityPermissionSuffix);
                    }
                    solrIdsOfDocsToDelete.addAll(solrIdsOfPermissionDocsToDelete);
                    logger.fine("Solr docs and perm docs to delete: " + String.join(", ", solrIdsOfDocsToDelete));

                    IndexResponse resultOfAttemptToPreemptivelyDeletePublishedFiles = solrIndexService
                            .deleteMultipleSolrIds(solrIdsOfDocsToDelete);
                    debug.append("result of attempt to preemptively delete published files before reindexing: "
                            + resultOfAttemptToPreemptivelyDeletePublishedFiles + "\n");
                }
            }
        }
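        // The cleanup rules above, summarized (hypothetical dataset id 42):
        //   latest RELEASED, released exists  -> delete dataset_42_draft, dataset_42_deaccessioned
        //   latest DRAFT, no released version -> delete dataset_42, dataset_42_deaccessioned
        //   latest DEACCESSIONED, no released -> delete dataset_42_draft, dataset_42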

        Map<DatasetVersion.VersionState, Boolean> desiredCards = new LinkedHashMap<>();
        /**
         * @todo refactor all of this below and have a single method that takes
         * the map of desired cards (which correspond to Solr documents) as one
         * of the arguments and does all the operations necessary to achieve the
         * desired state.
         */
        StringBuilder results = new StringBuilder();
        if (atLeastOnePublishedVersion == false) {
            results.append("No published version, nothing will be indexed as ")
                    .append(solrIdPublished).append("\n");
            if (latestVersionState.equals(DatasetVersion.VersionState.DRAFT)) {

                desiredCards.put(DatasetVersion.VersionState.DRAFT, true);
                IndexableDataset indexableDraftVersion = new IndexableDataset(latestVersion);
                String indexDraftResult = addOrUpdateDataset(indexableDraftVersion);
                results.append("The latest version is a working copy (latestVersionState: ")
                        .append(latestVersionStateString).append(") and indexing was attempted for ")
                        .append(solrIdDraftDataset).append(" (limited discoverability). Result: ")
                        .append(indexDraftResult).append("\n");

                desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                    String deleteDeaccessionedResult = removeDeaccessioned(dataset);
                    results.append("Draft exists, no need for deaccessioned version. Deletion attempted for ")
                            .append(solrIdDeaccessioned).append(" (and files). Result: ")
                            .append(deleteDeaccessionedResult).append("\n");
                }

                desiredCards.put(DatasetVersion.VersionState.RELEASED, false);
                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                    String deletePublishedResults = removePublished(dataset);
                    results.append("No published version. Attempting to delete traces of published version from index. Result: ")
                            .append(deletePublishedResults).append("\n");
                }

                /**
                 * Desired state for existence of cards: {DRAFT=true,
                 * DEACCESSIONED=false, RELEASED=false}
                 *
                 * No published version, nothing will be indexed as dataset_17
                 *
                 * The latest version is a working copy (latestVersionState:
                 * DRAFT) and indexing was attempted for dataset_17_draft
                 * (limited discoverability). Result: indexed dataset 17 as
                 * dataset_17_draft. filesIndexed: [datafile_18_draft]
                 *
                 * Draft exists, no need for deaccessioned version. Deletion
                 * attempted for dataset_17_deaccessioned (and files). Result:
                 * Attempted to delete dataset_17_deaccessioned from Solr index.
                 * updateReponse was:
                 * {responseHeader={status=0,QTime=1}}Attempted to delete
                 * datafile_18_deaccessioned from Solr index. updateReponse was:
                 * {responseHeader={status=0,QTime=1}}
                 *
                 * No published version. Attempting to delete traces of
                 * published version from index. Result: Attempted to delete
                 * dataset_17 from Solr index. updateReponse was:
                 * {responseHeader={status=0,QTime=1}}Attempted to delete
                 * datafile_18 from Solr index. updateReponse was:
                 * {responseHeader={status=0,QTime=0}}
                 */
                String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString();
                logger.fine(result);
                indexDatasetPermissions(dataset);
            } else if (latestVersionState.equals(DatasetVersion.VersionState.DEACCESSIONED)) {

                desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, true);
                IndexableDataset indexableDeaccessionedVersion = new IndexableDataset(latestVersion);
                String indexDeaccessionedVersionResult = addOrUpdateDataset(indexableDeaccessionedVersion);
                results.append("No draft version. Attempting to index as deaccessioned. Result: ").append(indexDeaccessionedVersionResult).append("\n");

                desiredCards.put(DatasetVersion.VersionState.RELEASED, false);
                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                    String deletePublishedResults = removePublished(dataset);
                    results.append("No published version. Attempting to delete traces of published version from index. Result: ").append(deletePublishedResults).append("\n");
                }

                desiredCards.put(DatasetVersion.VersionState.DRAFT, false);
                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                    List<String> solrDocIdsForDraftFilesToDelete = findSolrDocIdsForDraftFilesToDelete(dataset);
                    String deleteDraftDatasetVersionResult = removeSolrDocFromIndex(solrIdDraftDataset);
                    String deleteDraftFilesResults = deleteDraftFiles(solrDocIdsForDraftFilesToDelete);
                    results.append("Attempting to delete traces of drafts. Result: ")
                            .append(deleteDraftDatasetVersionResult).append(deleteDraftFilesResults).append("\n");
                }

                /**
                 * Desired state for existence of cards: {DEACCESSIONED=true,
                 * RELEASED=false, DRAFT=false}
                 *
                 * No published version, nothing will be indexed as dataset_17
                 *
                 * No draft version. Attempting to index as deaccessioned.
                 * Result: indexed dataset 17 as dataset_17_deaccessioned.
                 * filesIndexed: []
                 *
                 * No published version. Attempting to delete traces of
                 * published version from index. Result: Attempted to delete
                 * dataset_17 from Solr index. updateReponse was:
                 * {responseHeader={status=0,QTime=0}}Attempted to delete
                 * datafile_18 from Solr index. updateReponse was:
                 * {responseHeader={status=0,QTime=3}}
                 *
                 * Attempting to delete traces of drafts. Result: Attempted to
                 * delete dataset_17_draft from Solr index. updateReponse was:
                 * {responseHeader={status=0,QTime=1}}
                 */
                String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString();
                logger.fine(result);
                indexDatasetPermissions(dataset);
            } else {
                String result = "No-op. Unexpected condition reached: No released version and latest version is neither draft nor deaccessioned";
                logger.fine(result);
            }
        } else if (atLeastOnePublishedVersion == true) {
            results.append("Published versions found. ")
                    .append("Will attempt to index as ").append(solrIdPublished).append(" (discoverable by anonymous)\n");
            if (latestVersionState.equals(DatasetVersion.VersionState.RELEASED)
                    || latestVersionState.equals(DatasetVersion.VersionState.DEACCESSIONED)) {

                desiredCards.put(DatasetVersion.VersionState.RELEASED, true);
                IndexableDataset indexableReleasedVersion = new IndexableDataset(releasedVersion);
                String indexReleasedVersionResult = addOrUpdateDataset(indexableReleasedVersion);
                results.append("Attempted to index " + solrIdPublished).append(". Result: ").append(indexReleasedVersionResult).append("\n");

                desiredCards.put(DatasetVersion.VersionState.DRAFT, false);
                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                    List<String> solrDocIdsForDraftFilesToDelete = findSolrDocIdsForDraftFilesToDelete(dataset);
                    String deleteDraftDatasetVersionResult = removeSolrDocFromIndex(solrIdDraftDataset);
                    String deleteDraftFilesResults = deleteDraftFiles(solrDocIdsForDraftFilesToDelete);
                    results.append("The latest version is published. Attempting to delete drafts. Result: ")
                            .append(deleteDraftDatasetVersionResult).append(deleteDraftFilesResults).append("\n");
                }

                desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                    String deleteDeaccessionedResult = removeDeaccessioned(dataset);
                    results.append("No need for deaccessioned version. Deletion attempted for ")
                            .append(solrIdDeaccessioned).append(". Result: ").append(deleteDeaccessionedResult);
                }

                /**
                 * Desired state for existence of cards: {RELEASED=true,
                 * DRAFT=false, DEACCESSIONED=false}
                 *
                 * Released versions found: 1. Will attempt to index as
                 * dataset_17 (discoverable by anonymous)
                 *
                 * Attempted to index dataset_17. Result: indexed dataset 17 as
                 * dataset_17. filesIndexed: [datafile_18]
                 *
                 * The latest version is published. Attempting to delete drafts.
                 * Result: Attempted to delete dataset_17_draft from Solr index.
                 * updateReponse was: {responseHeader={status=0,QTime=1}}
                 *
                 * No need for deaccessioned version. Deletion attempted for
                 * dataset_17_deaccessioned. Result: Attempted to delete
                 * dataset_17_deaccessioned from Solr index. updateReponse was:
                 * {responseHeader={status=0,QTime=1}}Attempted to delete
                 * datafile_18_deaccessioned from Solr index. updateReponse was:
                 * {responseHeader={status=0,QTime=0}}
                 */
                String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString();
                logger.fine(result);
                indexDatasetPermissions(dataset);
            } else if (latestVersionState.equals(DatasetVersion.VersionState.DRAFT)) {

                IndexableDataset indexableDraftVersion = new IndexableDataset(latestVersion);
                desiredCards.put(DatasetVersion.VersionState.DRAFT, true);
                Set<Long> datafilesInDraftVersion = new HashSet<>();
                for (FileMetadata fm : latestVersion.getFileMetadatas()) {
                    datafilesInDraftVersion.add(fm.getDataFile().getId());
                }

                desiredCards.put(DatasetVersion.VersionState.RELEASED, true);
                IndexableDataset indexableReleasedVersion = new IndexableDataset(releasedVersion);
                String indexReleasedVersionResult = addOrUpdateDataset(indexableReleasedVersion, datafilesInDraftVersion);
                results.append("There is a published version we will attempt to index. Result: ").append(indexReleasedVersionResult).append("\n");

                String indexDraftResult = addOrUpdateDataset(indexableDraftVersion);
                results.append("The latest version is a working copy (latestVersionState: ")
                        .append(latestVersionStateString).append(") and will be indexed as ")
                        .append(solrIdDraftDataset).append(" (limited visibility). Result: ").append(indexDraftResult).append("\n");

                desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                    String deleteDeaccessionedResult = removeDeaccessioned(dataset);
                    results.append("No need for deaccessioned version. Deletion attempted for ")
                            .append(solrIdDeaccessioned).append(". Result: ").append(deleteDeaccessionedResult);
                }

                /**
                 * Desired state for existence of cards: {DRAFT=true,
                 * RELEASED=true, DEACCESSIONED=false}
                 *
                 * Released versions found: 1. Will attempt to index as
                 * dataset_17 (discoverable by anonymous)
                 *
                 * The latest version is a working copy (latestVersionState:
                 * DRAFT) and will be indexed as dataset_17_draft (limited
                 * visibility). Result: indexed dataset 17 as dataset_17_draft.
                 * filesIndexed: [datafile_18_draft]
                 *
                 * There is a published version we will attempt to index.
                 * Result: indexed dataset 17 as dataset_17. filesIndexed:
                 * [datafile_18]
                 *
                 * No need for deaccessioned version. Deletion attempted for
                 * dataset_17_deaccessioned. Result: Attempted to delete
                 * dataset_17_deaccessioned from Solr index. updateReponse was:
                 * {responseHeader={status=0,QTime=1}}Attempted to delete
                 * datafile_18_deaccessioned from Solr index. updateReponse was:
                 * {responseHeader={status=0,QTime=0}}
                 */
                String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString();
                logger.fine(result);
                indexDatasetPermissions(dataset);
            } else {
                String result = "No-op. Unexpected condition reached: There is at least one published version but the latest version is neither published nor draft";
                logger.fine(result);
            }
        } else {
            String result = "No-op. Unexpected condition reached: Has a version been published or not?";
            logger.fine(result);
        }
    }

    private void writeDebugInfo(StringBuilder debug, Dataset dataset) {
        List<DatasetVersion> versions = dataset.getVersions();
        int numPublishedVersions = 0;
        for (DatasetVersion datasetVersion : versions) {
            Long versionDatabaseId = datasetVersion.getId();
            String versionTitle = datasetVersion.getTitle();
            String semanticVersion = datasetVersion.getSemanticVersion();
            DatasetVersion.VersionState versionState = datasetVersion.getVersionState();
            if (versionState.equals(DatasetVersion.VersionState.RELEASED)) {
                numPublishedVersions += 1;
            }
            debug.append("version found with database id " + versionDatabaseId + "\n");
            debug.append("- title: " + versionTitle + "\n");
            debug.append("- semanticVersion-VersionState: " + semanticVersion + "-" + versionState + "\n");
            List<String> fileInfo = new ArrayList<>();
            List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();

            for (FileMetadata fileMetadata : fileMetadatas) {
                // (See the note in doIndexDataset above about preemptively deleting
                // the Solr documents of published files before reindexing;
                // https://github.com/IQSS/dataverse/issues/762)
                fileInfo.add(fileMetadata.getDataFile().getId() + ":" + fileMetadata.getLabel());
            }
            int numFiles = 0;
            if (fileMetadatas != null) {
                numFiles = fileMetadatas.size();
            }
            debug.append("- files: " + numFiles + " " + fileInfo.toString() + "\n");
        }
        debug.append("numPublishedVersions: " + numPublishedVersions + "\n");
    }

    private IndexResponse indexDatasetPermissions(Dataset dataset) {
        boolean disabledForDebugging = false;
        if (disabledForDebugging) {
            /**
             * Performance problems indexing permissions in
             * https://github.com/IQSS/dataverse/issues/50 and
             * https://github.com/IQSS/dataverse/issues/2036
             */
            return new IndexResponse("permissions indexing disabled for debugging");
        }
        IndexResponse indexResponse = solrIndexService.indexPermissionsOnSelfAndChildren(dataset);
        return indexResponse;
    }

    private String addOrUpdateDataset(IndexableDataset indexableDataset) throws SolrServerException, IOException {
        String result = addOrUpdateDataset(indexableDataset, null);
        return result;
    }

    public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set<Long> datafilesInDraftVersion) throws SolrServerException, IOException {
        IndexableDataset.DatasetState state = indexableDataset.getDatasetState();
        Dataset dataset = indexableDataset.getDatasetVersion().getDataset();
        logger.fine("adding or updating Solr document for dataset id " + dataset.getId());
        Collection<SolrInputDocument> docs = new ArrayList<>();
        SolrInputDocument solrInputDocument = new SolrInputDocument();
        String datasetSolrDocId = indexableDataset.getSolrDocId();
        solrInputDocument.addField(SearchFields.ID, datasetSolrDocId);
        solrInputDocument.addField(SearchFields.ENTITY_ID, dataset.getId());
        String dataverseVersion = systemConfig.getVersion();
        solrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion);
        solrInputDocument.addField(SearchFields.IDENTIFIER, dataset.getGlobalId().toString());
        solrInputDocument.addField(SearchFields.DATASET_PERSISTENT_ID, dataset.getGlobalId().toString());
        solrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL());
        solrInputDocument.addField(SearchFields.TYPE, "datasets");
        solrInputDocument.addField(SearchFields.DATASET_VALID, indexableDataset.getDatasetVersion().isValid());

        final Dataverse dataverse = dataset.getDataverseContext();
        final String dvIndexableCategoryName = dataverse.getIndexableCategoryName();
        final String dvAlias = dataverse.getAlias();
        final String dvDisplayName = dataverse.getDisplayName();
        final String rdvName = findRootDataverseCached().getName();
        // This only grabs the immediate parent dataverse's category. We do the same for dataverses themselves.
        solrInputDocument.addField(SearchFields.CATEGORY_OF_DATAVERSE, dvIndexableCategoryName);
        solrInputDocument.addField(SearchFields.IDENTIFIER_OF_DATAVERSE, dvAlias);
        solrInputDocument.addField(SearchFields.DATAVERSE_NAME, dvDisplayName);
953
        
954
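        // Pick the date used for sorting: the most recent major version
        // release date when one exists, otherwise the dataset create date,
        // otherwise "now" (see the fallback logging below).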
        Date datasetSortByDate = new Date();
        Date majorVersionReleaseDate = dataset.getMostRecentMajorVersionReleaseDate();
        if (majorVersionReleaseDate != null) {
            logger.fine("major release date found: " + majorVersionReleaseDate.toString());
            datasetSortByDate = majorVersionReleaseDate;
        } else {
            if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.WORKING_COPY)) {
                solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING);
            } else if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.DEACCESSIONED)) {
                solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DEACCESSIONED_STRING);
            }
            Date createDate = dataset.getCreateDate();
            if (createDate != null) {
                logger.fine("can't find major release date, using create date: " + createDate);
                datasetSortByDate = createDate;
            } else {
                logger.info("can't find major release date or create date, using \"now\"");
                datasetSortByDate = new Date();
            }
        }
        solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, datasetSortByDate);

        if (state.equals(DatasetState.PUBLISHED)) {
            solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING);
            if (FeatureFlags.ADD_PUBLICOBJECT_SOLR_FIELD.enabled()) {
                solrInputDocument.addField(SearchFields.PUBLIC_OBJECT, true);
            }
            // solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE,
            // dataset.getPublicationDate());
        } else if (state.equals(DatasetState.WORKING_COPY)) {
            solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING);
        }

        addDatasetReleaseDateToSolrDoc(solrInputDocument, dataset);

        if (dataset.isHarvested()) {
            solrInputDocument.addField(SearchFields.IS_HARVESTED, true);
            if (FeatureFlags.INDEX_HARVESTED_METADATA_SOURCE.enabled()) {
                // New - as of 6.3 - option of indexing the actual origin of
                // harvested objects as the metadata source:
                solrInputDocument.addField(SearchFields.METADATA_SOURCE,
                                        dataset.getHarvestedFrom() != null ? dataset.getHarvestedFrom().getName() : HARVESTED);
            } else {
                solrInputDocument.addField(SearchFields.METADATA_SOURCE, HARVESTED);
            }
        } else {
            solrInputDocument.addField(SearchFields.IS_HARVESTED, false);
            solrInputDocument.addField(SearchFields.METADATA_SOURCE, rdvName); // rootDataverseName
        }

        DatasetType datasetType = dataset.getDatasetType();
        solrInputDocument.addField(SearchFields.DATASET_TYPE, datasetType.getName());

        DatasetVersion datasetVersion = indexableDataset.getDatasetVersion();
        String parentDatasetTitle = "TBD";
        if (datasetVersion != null) {

            addLicenseToSolrDoc(solrInputDocument, datasetVersion);

            solrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId());
            solrInputDocument.addField(SearchFields.DATASET_CITATION, datasetVersion.getCitation(false));
            solrInputDocument.addField(SearchFields.DATASET_CITATION_HTML, datasetVersion.getCitation(true));

            solrInputDocument.addField(SearchFields.FILE_COUNT, datasetVersionFilesServiceBean.getFileMetadataCount(datasetVersion));

            if (datasetVersion.isInReview()) {
                solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, IN_REVIEW_STRING);
            }
            if (datasetVersion.getExternalStatusLabel() != null) {
                solrInputDocument.addField(SearchFields.EXTERNAL_STATUS, datasetVersion.getExternalStatusLabel());
            }

            Set<String> langs = settingsService.getConfiguredLanguages();
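            // The CVOC (external controlled vocabulary) config maps dataset
            // field type ids to JSON configs. A config may carry a
            // "managed-fields" object whose values name the child fields that
            // hold vocabulary-managed data, e.g. (hypothetical example)
            // {"managed-fields": {"vocabularyName": "keywordVocabulary"}}.
            // cvocManagedFieldMap flattens that into fieldTypeId -> set of
            // managed child field names, for quick lookup while indexing.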
            Map<Long, JsonObject> cvocMap = datasetFieldService.getCVocConf(true);
            Map<Long, Set<String>> cvocManagedFieldMap = new HashMap<>();
            for (Map.Entry<Long, JsonObject> cvocEntry : cvocMap.entrySet()) {
                if (cvocEntry.getValue().containsKey("managed-fields")) {
                    JsonObject managedFields = cvocEntry.getValue().getJsonObject("managed-fields");
                    Set<String> managedFieldValues = new HashSet<>();
                    for (String s : managedFields.keySet()) {
                        managedFieldValues.add(managedFields.getString(s));
                    }
                    cvocManagedFieldMap.put(cvocEntry.getKey(), managedFieldValues);
                }
            }

            Set<String> metadataBlocksWithValue = new HashSet<>();
            for (DatasetField dsf : datasetVersion.getFlatDatasetFields()) {

                DatasetFieldType dsfType = dsf.getDatasetFieldType();
                String solrFieldSearchable = dsfType.getSolrField().getNameSearchable();
                String solrFieldFacetable = dsfType.getSolrField().getNameFacetable();

                if (dsf.getValues() != null && !dsf.getValues().isEmpty() && dsf.getValues().get(0) != null && solrFieldSearchable != null) {
                    // Index all metadata blocks that have a value - to show in the new facet category SearchFields.METADATA_TYPES
                    if (dsfType.getMetadataBlock() != null) {
                        metadataBlocksWithValue.add(dsfType.getMetadataBlock().getName());
                    }

                    logger.fine("indexing " + dsf.getDatasetFieldType().getName() + ":" + dsf.getValues() + " into " + solrFieldSearchable + " and maybe " + solrFieldFacetable);
                    // if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.INTEGER))
                    // {
                    if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.EMAIL)) {
                        // no-op. we want to keep email addresses out of Solr per
                        // https://github.com/IQSS/dataverse/issues/759
                    } else if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.DATE)) {
                        String dateAsString = "";
                        if (!dsf.getValues_nondisplay().isEmpty()) {
                            dateAsString = dsf.getValues_nondisplay().get(0);
                        }
                        logger.fine("date as string: " + dateAsString);
                        if (dateAsString != null && !dateAsString.isEmpty()) {
                            SimpleDateFormat inputDateyyyy = new SimpleDateFormat("yyyy", Locale.ENGLISH);
                            try {
                                /**
                                 * @todo when bean validation is working we
                                 * won't have to convert strings into dates
                                 */
                                logger.fine("Trying to convert " + dateAsString + " to a YYYY date from dataset " + dataset.getId());
                                Date dateAsDate = inputDateyyyy.parse(dateAsString);
                                SimpleDateFormat yearOnly = new SimpleDateFormat("yyyy");
                                String datasetFieldFlaggedAsDate = yearOnly.format(dateAsDate);
                                logger.fine("YYYY only: " + datasetFieldFlaggedAsDate);
                                // solrInputDocument.addField(solrFieldSearchable,
                                // Integer.parseInt(datasetFieldFlaggedAsDate));
                                solrInputDocument.addField(solrFieldSearchable, datasetFieldFlaggedAsDate);
                                if (dsfType.getSolrField().isFacetable()) {
                                    // solrInputDocument.addField(solrFieldFacetable,
                                    // Integer.parseInt(datasetFieldFlaggedAsDate));
                                    solrInputDocument.addField(solrFieldFacetable, datasetFieldFlaggedAsDate);
                                }
                            } catch (Exception ex) {
                                logger.info("unable to convert " + dateAsString + " into YYYY format and couldn't index it (" + dsfType.getName() + ")");
                            }
                        }
                    } else {
                        // _s (dynamic string) and all other Solr fields

                        if (dsf.getDatasetFieldType().getName().equals("authorAffiliation")) {
                            /**
                             * @todo think about how to tie the fact that this
                             * needs to be multivalued (_ss) because a
                             * multivalued facet (authorAffiliation_ss) is being
                             * collapsed into here at index time. The business
                             * logic to determine if a data-driven metadata
                             * field should be indexed into Solr as a single or
                             * multiple value lives in the getSolrField() method
                             * of DatasetField.java
                             */
                            solrInputDocument.addField(SearchFields.AFFILIATION, dsf.getValuesWithoutNaValues());
                        } else if (dsf.getDatasetFieldType().getName().equals("title")) {
                            // datasets have titles, not names, but we index the title under name as well
                            // so we can sort datasets by name alongside dataverses and files
                            List<String> possibleTitles = dsf.getValues();
                            String firstTitle = possibleTitles.get(0);
                            if (firstTitle != null) {
                                parentDatasetTitle = firstTitle;
                            }
                            solrInputDocument.addField(SearchFields.NAME_SORT, dsf.getValues());
                        }

                        // If there is a CVOC config for the field
                        if (cvocMap.containsKey(dsfType.getId())) {
                            List<String> vals = dsf.getValues_nondisplay();
                            Set<String> searchStrings = new HashSet<>();
                            for (String val : vals) {
                                searchStrings.add(val);
                                // Try to get string values from externalvocabularyvalue, using val as the termUri
                                searchStrings.addAll(datasetFieldService.getIndexableStringsByTermUri(val, cvocMap.get(dsfType.getId()), dsfType.getName()));

                                if (dsfType.getParentDatasetFieldType() != null) {
                                    List<DatasetField> childDatasetFields = dsf.getParentDatasetFieldCompoundValue().getChildDatasetFields();
                                    for (DatasetField df : childDatasetFields) {
                                        if (cvocManagedFieldMap.containsKey(dsfType.getId()) && cvocManagedFieldMap.get(dsfType.getId()).contains(df.getDatasetFieldType().getName())) {
                                            String solrManagedFieldSearchable = df.getDatasetFieldType().getSolrField().getNameSearchable();
                                            // Try to get string values from externalvocabularyvalue, but for the managed fields of the CVOC config
                                            Set<String> stringsForManagedField = datasetFieldService.getIndexableStringsByTermUri(val, cvocMap.get(dsfType.getId()), df.getDatasetFieldType().getName());
                                            logger.fine(solrManagedFieldSearchable + " filled with externalvocabularyvalue : " + stringsForManagedField);
                                            // addField() adds a value rather than replacing existing values, so values
                                            // mapped via the CVOC config can be added before or after the real
                                            // DatasetField value(s) of solrManagedFieldSearchable are indexed
                                            solrInputDocument.addField(solrManagedFieldSearchable, stringsForManagedField);
                                        }
                                    }
                                }
                            }
                            logger.fine(solrFieldSearchable + " filled with externalvocabularyvalue : " + searchStrings);
                            solrInputDocument.addField(solrFieldSearchable, searchStrings);
                            if (dsfType.getSolrField().isFacetable()) {
                                logger.fine(solrFieldFacetable + " gets " + vals);
                                solrInputDocument.addField(solrFieldFacetable, vals);
                            }
                        } else if (dsfType.isControlledVocabulary()) {
                            /**
                             * If the cvv list is empty but the dfv list is not, it is assumed this was harvested
                             * from an installation that had controlled vocabulary entries that don't exist in this db.
                             * @see <a href="https://github.com/IQSS/dataverse/issues/9992">Feature Request/Idea: Harvest metadata values that aren't from a list of controlled values #9992</a>
                             */
                            if (dsf.getControlledVocabularyValues().isEmpty()) {
                                for (DatasetFieldValue dfv : dsf.getDatasetFieldValues()) {
                                    if (dfv.getValue() == null || dfv.getValue().equals(DatasetField.NA_VALUE)) {
                                        continue;
                                    }
                                    solrInputDocument.addField(solrFieldSearchable, dfv.getValue());

                                    if (dsfType.getSolrField().isFacetable()) {
                                        solrInputDocument.addField(solrFieldFacetable, dfv.getValue());
                                    }
                                }
                            } else {
                                for (ControlledVocabularyValue controlledVocabularyValue : dsf.getControlledVocabularyValues()) {
                                    if (controlledVocabularyValue.getStrValue().equals(DatasetField.NA_VALUE)) {
                                        continue;
                                    }

                                    // Index in all used languages (display and metadata languages)
                                    if (!dsfType.isAllowMultiples() || langs.isEmpty()) {
                                        solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getStrValue());
                                    } else {
                                        for (String locale : langs) {
                                            solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getLocaleStrValue(locale));
                                        }
                                    }

                                    if (dsfType.getSolrField().isFacetable()) {
                                        solrInputDocument.addField(solrFieldFacetable, controlledVocabularyValue.getStrValue());
                                    }
                                }
                            }
                        } else if (dsfType.getFieldType().equals(DatasetFieldType.FieldType.TEXTBOX)) {
                            // strip HTML
                            List<String> htmlFreeText = StringUtil.htmlArray2textArray(dsf.getValuesWithoutNaValues());
                            solrInputDocument.addField(solrFieldSearchable, htmlFreeText);
                            if (dsfType.getSolrField().isFacetable()) {
                                solrInputDocument.addField(solrFieldFacetable, htmlFreeText);
                            }
                        } else {
                            // do not strip HTML
                            solrInputDocument.addField(solrFieldSearchable, dsf.getValuesWithoutNaValues());
                            if (dsfType.getSolrField().isFacetable()) {
                                if (dsf.getDatasetFieldType().getName().equals(DatasetFieldConstant.topicClassValue)) {
                                    String topicClassificationTerm = getTopicClassificationTermOrTermAndVocabulary(dsf);
                                    if (topicClassificationTerm != null) {
                                        logger.fine(solrFieldFacetable + " gets " + topicClassificationTerm);
                                        solrInputDocument.addField(solrFieldFacetable, topicClassificationTerm);
                                    }
                                } else {
                                    solrInputDocument.addField(solrFieldFacetable, dsf.getValuesWithoutNaValues());
                                }
                            }
                        }
                    }
                }

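                // Solr spatial fields use the WKT-style syntax
                // ENVELOPE(minX, maxX, maxY, minY), i.e. ENVELOPE(W, E, N, S),
                // which is the order the strings are concatenated in below.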
                // ToDo - define a geom/bbox type solr field and find those instead of just this one
                if (dsfType.getName().equals(DatasetFieldConstant.geographicBoundingBox)) {
                    String minWestLon = null;
                    String maxEastLon = null;
                    String maxNorthLat = null;
                    String minSouthLat = null;
                    for (DatasetFieldCompoundValue compoundValue : dsf.getDatasetFieldCompoundValues()) {
                        String westLon = null;
                        String eastLon = null;
                        String northLat = null;
                        String southLat = null;
                        for (DatasetField childDsf : compoundValue.getChildDatasetFields()) {
                            switch (childDsf.getDatasetFieldType().getName()) {
                            case DatasetFieldConstant.westLongitude:
                                westLon = childDsf.getRawValue();
                                break;
                            case DatasetFieldConstant.eastLongitude:
                                eastLon = childDsf.getRawValue();
                                break;
                            case DatasetFieldConstant.northLatitude:
                                northLat = childDsf.getRawValue();
                                break;
                            case DatasetFieldConstant.southLatitude:
                                southLat = childDsf.getRawValue();
                                break;
                            }
                        }
                        if ((eastLon != null || westLon != null) && (northLat != null || southLat != null)) {
                            // we have a point or a box, so proceed
                            if (eastLon == null) {
                                eastLon = westLon;
                            } else if (westLon == null) {
                                westLon = eastLon;
                            }
                            if (northLat == null) {
                                northLat = southLat;
                            } else if (southLat == null) {
                                southLat = northLat;
                            }
                            // Find the overall bounding box that includes all bounding boxes
                            if (minWestLon == null || Float.parseFloat(minWestLon) > Float.parseFloat(westLon)) {
                                minWestLon = westLon;
                            }
                            if (maxEastLon == null || Float.parseFloat(maxEastLon) < Float.parseFloat(eastLon)) {
                                maxEastLon = eastLon;
                            }
                            if (minSouthLat == null || Float.parseFloat(minSouthLat) > Float.parseFloat(southLat)) {
                                minSouthLat = southLat;
                            }
                            if (maxNorthLat == null || Float.parseFloat(maxNorthLat) < Float.parseFloat(northLat)) {
                                maxNorthLat = northLat;
                            }

                            if (DatasetFieldValueValidator.validateBoundingBox(westLon, eastLon, northLat, southLat)) {
                                // W, E, N, S
                                solrInputDocument.addField(SearchFields.GEOLOCATION, "ENVELOPE(" + westLon + "," + eastLon + "," + northLat + "," + southLat + ")");
                            }
                        }
                    }
                    // Only one bbox per dataset
                    // W, E, N, S
                    if (DatasetFieldValueValidator.validateBoundingBox(minWestLon, maxEastLon, maxNorthLat, minSouthLat) &&
                            (minWestLon != null || maxEastLon != null) && (maxNorthLat != null || minSouthLat != null)) {
                        solrInputDocument.addField(SearchFields.BOUNDING_BOX, "ENVELOPE(" + minWestLon + "," + maxEastLon + "," + maxNorthLat + "," + minSouthLat + ")");
                    }

                }
            }

            for (String metadataBlockName : metadataBlocksWithValue) {
                solrInputDocument.addField(SearchFields.METADATA_TYPES, metadataBlockName);
            }
        }

        List<String> dataversePaths = retrieveDVOPaths(dataset);
        solrInputDocument.addField(SearchFields.SUBTREE, dataversePaths);
        // solrInputDocument.addField(SearchFields.HOST_DATAVERSE,
        // dataset.getOwner().getName());
        solrInputDocument.addField(SearchFields.PARENT_ID, dataset.getOwner().getId());
        solrInputDocument.addField(SearchFields.PARENT_NAME, dataset.getOwner().getName());

        if (state.equals(DatasetState.DEACCESSIONED)) {
            String deaccessionNote = datasetVersion.getVersionNote();
            if (deaccessionNote != null) {
                solrInputDocument.addField(SearchFields.DATASET_DEACCESSION_REASON, deaccessionNote);
            }
        }
        docs.add(solrInputDocument);

        /**
         * File Indexing
         */
        boolean doFullTextIndexing = settingsService.isTrueForKey(SettingsServiceBean.Key.SolrFullTextIndexing, false);
        Long maxFTIndexingSize = settingsService.getValueForKeyAsLong(SettingsServiceBean.Key.SolrMaxFileSizeForFullTextIndexing);
        long maxSize = maxFTIndexingSize != null ? maxFTIndexingSize.longValue() : Long.MAX_VALUE;

        List<String> filesIndexed = new ArrayList<>();
        if (datasetVersion != null) {
            List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
            List<FileMetadata> releasedFileMetadatas = new ArrayList<>();
            Map<Long, FileMetadata> fileMap = new HashMap<>();
            boolean checkForDuplicateMetadata = false;
            if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) {
                checkForDuplicateMetadata = true;
                releasedFileMetadatas = dataset.getReleasedVersion().getFileMetadatas();
                for (FileMetadata released : releasedFileMetadatas) {
                    fileMap.put(released.getDataFile().getId(), released);
                }
                logger.fine(
                        "We are indexing a draft version of a dataset that has a released version. We'll be checking whether the file metadatas are exact clones of the released versions.");
            }
            LocalDate embargoEndDate = null;
            LocalDate retentionEndDate = null;
            final String datasetCitation = dataset.getCitation();
            final Long datasetId = dataset.getId();
            final String datasetGlobalId = dataset.getGlobalId().toString();
            for (FileMetadata fileMetadata : fileMetadatas) {
                LocalDate end = null;
                LocalDate start = null;
                Embargo emb = fileMetadata.getDataFile().getEmbargo();
                if (emb != null) {
                    end = emb.getDateAvailable();
                    if (embargoEndDate == null || end.isAfter(embargoEndDate)) {
                        embargoEndDate = end;
                    }
                }
                Retention ret = fileMetadata.getDataFile().getRetention();
                if (ret != null) {
                    start = ret.getDateUnavailable();
                    if (retentionEndDate == null || start.isBefore(retentionEndDate)) {
                        retentionEndDate = start;
                    }
                }
                boolean indexThisMetadata = indexableDataset.isFilesShouldBeIndexed();
                if (indexThisMetadata && checkForDuplicateMetadata && !releasedFileMetadatas.isEmpty()) {
                    logger.fine("Checking if this file metadata is a duplicate.");
                    FileMetadata getFromMap = fileMap.get(fileMetadata.getDataFile().getId());
                    if (getFromMap != null) {
                        if ((fileMetadata.getDataFile().isRestricted() == getFromMap.getDataFile().isRestricted())) {
                            if (fileMetadata.contentEquals(getFromMap)
                                    && VariableMetadataUtil.compareVariableMetadata(getFromMap, fileMetadata)) {
                                indexThisMetadata = false;
                                logger.fine("This file metadata hasn't changed since the released version; skipping indexing.");
                            } else {
                                logger.fine("This file metadata has changed since the released version; we want to index it!");
                            }
                        } else {
                            logger.fine("This file's restricted status has changed since the released version; we want to index it!");
                        }
                    }
                }
                if (indexThisMetadata) {

                    SolrInputDocument datafileSolrInputDocument = new SolrInputDocument();
                    Long fileEntityId = fileMetadata.getDataFile().getId();
                    datafileSolrInputDocument.addField(SearchFields.ENTITY_ID, fileEntityId);
                    datafileSolrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion);
                    datafileSolrInputDocument.addField(SearchFields.IDENTIFIER, fileEntityId);
                    datafileSolrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL());
                    datafileSolrInputDocument.addField(SearchFields.TYPE, "files");
                    datafileSolrInputDocument.addField(SearchFields.CATEGORY_OF_DATAVERSE, dvIndexableCategoryName);
                    if (end != null) {
                        datafileSolrInputDocument.addField(SearchFields.EMBARGO_END_DATE, end.toEpochDay());
                    }
                    if (start != null) {
                        datafileSolrInputDocument.addField(SearchFields.RETENTION_END_DATE, start.toEpochDay());
                    }
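                    // A note on the block below: BodyContentHandler(-1)
                    // disables Tika's default character write limit, and the
                    // size check against maxSize is the only pre-filter; as
                    // the inline comment says, there is no check of whether
                    // the file type is a sensible candidate for extraction.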
                    /* Full-text indexing using Apache Tika */
                    if (doFullTextIndexing) {
                        if (!dataset.isHarvested() && !fileMetadata.getDataFile().isRestricted()
                                && !fileMetadata.getDataFile().isFilePackage()
                                && fileMetadata.getDataFile().getRetention() == null) {
                            StorageIO<DataFile> accessObject = null;
                            InputStream instream = null;
                            ContentHandler textHandler = null;
                            try {
                                accessObject = DataAccess.getStorageIO(fileMetadata.getDataFile(),
                                        new DataAccessRequest());
                                if (accessObject != null) {
                                    accessObject.open();
                                    // If the size is >max, we don't use the stream. However, for S3, the stream is
                                    // currently opened in the call above (see
                                    // https://github.com/IQSS/dataverse/issues/5165), so we want to get a handle so
                                    // we can close it below.
                                    instream = accessObject.getInputStream();
                                    if (accessObject.getSize() <= maxSize) {
                                        AutoDetectParser autoParser = new AutoDetectParser();
                                        textHandler = new BodyContentHandler(-1);
                                        Metadata metadata = new Metadata();
                                        ParseContext context = new ParseContext();
                                        /*
                                         * Try parsing the file. Note that, other than by limiting size, there's been no
                                         * check to see whether this file is a good candidate for text extraction (e.g.
                                         * based on type).
                                         */
                                        autoParser.parse(instream, textHandler, metadata, context);
                                        datafileSolrInputDocument.addField(SearchFields.FULL_TEXT,
                                                textHandler.toString());
                                    }
                                }
                            } catch (Exception e) {
                                // Needs better logging of what went wrong in order to
                                // track down "bad" documents.
                                logger.warning(String.format("Full-text indexing for %s failed",
                                        fileMetadata.getDataFile().getDisplayName()));
                                e.printStackTrace();
                            } catch (OutOfMemoryError e) {
                                textHandler = null;
                                logger.warning(String.format("Full-text indexing for %s failed due to OutOfMemoryError",
                                        fileMetadata.getDataFile().getDisplayName()));
                            } catch (Error e) {
                                // Catch everything - full-text indexing is complex enough (and uses enough 3rd party
                                // components) that it can fail, and we don't want problems here to break other
                                // Dataverse functionality (e.g. edits)
                                logger.severe(String.format("Full-text indexing for %s failed due to Error: %s : %s",
                                        fileMetadata.getDataFile().getDisplayName(), e.getClass().getCanonicalName(), e.getLocalizedMessage()));
                            } finally {
                                IOUtils.closeQuietly(instream);
                            }
                        }
                    }

                    String filenameCompleteFinal = "";
                    if (fileMetadata != null) {
                        String filenameComplete = fileMetadata.getLabel();
                        if (filenameComplete != null) {
                            String filenameWithoutExtension = "";
                            // String extension = "";
                            int i = filenameComplete.lastIndexOf('.');
                            if (i > 0) {
                                // extension = filenameComplete.substring(i + 1);
                                try {
                                    filenameWithoutExtension = filenameComplete.substring(0, i);
                                    datafileSolrInputDocument.addField(SearchFields.FILENAME_WITHOUT_EXTENSION, filenameWithoutExtension);
                                    datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameWithoutExtension);
                                } catch (IndexOutOfBoundsException ex) {
                                    filenameWithoutExtension = "";
                                }
                            } else {
                                logger.fine("problem with filename '" + filenameComplete + "': no extension? empty string as filename?");
                                filenameWithoutExtension = filenameComplete;
                            }
                            filenameCompleteFinal = filenameComplete;
                        }
                        for (String tag : fileMetadata.getCategoriesByName()) {
                            datafileSolrInputDocument.addField(SearchFields.FILE_TAG, tag);
                            datafileSolrInputDocument.addField(SearchFields.FILE_TAG_SEARCHABLE, tag);
                        }
                    }
                    datafileSolrInputDocument.addField(SearchFields.NAME, filenameCompleteFinal);
                    datafileSolrInputDocument.addField(SearchFields.NAME_SORT, filenameCompleteFinal);
                    datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameCompleteFinal);

                    datafileSolrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId());
                    addLicenseToSolrDoc(datafileSolrInputDocument, datasetVersion);

                    /**
                     * for rules on sorting files see
                     * https://docs.google.com/a/harvard.edu/document/d/1DWsEqT8KfheKZmMB3n_VhJpl9nIxiUjai_AIQPAjiyA/edit?usp=sharing
                     * via https://redmine.hmdc.harvard.edu/issues/3701
                     */
                    Date fileSortByDate = new Date();
                    DataFile datafile = fileMetadata.getDataFile();
                    if (datafile != null) {
                        boolean fileHasBeenReleased = datafile.isReleased();
                        if (fileHasBeenReleased) {
                            logger.fine("indexing file with filePublicationTimestamp. " + fileMetadata.getId() + " (file id " + datafile.getId() + ")");
                            Timestamp filePublicationTimestamp = datafile.getPublicationDate();
                            if (filePublicationTimestamp != null) {
                                fileSortByDate = filePublicationTimestamp;
                            } else {
                                String msg = "filePublicationTimestamp was null for fileMetadata id " + fileMetadata.getId() + " (file id " + datafile.getId() + ")";
                                logger.info(msg);
                            }
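                            // ACCESS precedence, as encoded in the nested ternary below:
                            // an expired retention period wins over an active embargo,
                            // which in turn wins over plain restricted/public status.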
                            datafileSolrInputDocument.addField(SearchFields.ACCESS,
                                    FileUtil.isRetentionExpired(datafile)
                                        ? SearchConstants.RETENTIONEXPIRED :
                                            FileUtil.isActivelyEmbargoed(datafile)
                                                ? (fileMetadata.isRestricted() ? SearchConstants.EMBARGOEDTHENRESTRICTED
                                                        : SearchConstants.EMBARGOEDTHENPUBLIC)
                                                : (fileMetadata.isRestricted() ? SearchConstants.RESTRICTED
                                                        : SearchConstants.PUBLIC));
                        } else {
                            logger.fine("indexing file with fileCreateTimestamp. " + fileMetadata.getId() + " (file id " + datafile.getId() + ")");
                            Timestamp fileCreateTimestamp = datafile.getCreateDate();
                            if (fileCreateTimestamp != null) {
                                fileSortByDate = fileCreateTimestamp;
                            } else {
                                String msg = "fileCreateTimestamp was null for fileMetadata id " + fileMetadata.getId() + " (file id " + datafile.getId() + ")";
                                logger.info(msg);
                            }
                            datafileSolrInputDocument.addField(SearchFields.ACCESS,
                                    FileUtil.isActivelyEmbargoed(fileMetadata)
                                            ? (fileMetadata.isRestricted() ? SearchConstants.EMBARGOEDTHENRESTRICTED
                                                    : SearchConstants.EMBARGOEDTHENPUBLIC)
                                            : (fileMetadata.isRestricted() ? SearchConstants.RESTRICTED
                                                    : SearchConstants.PUBLIC));
                        }
                        if (datafile.isHarvested()) {
                            datafileSolrInputDocument.addField(SearchFields.IS_HARVESTED, true);
                            if (FeatureFlags.INDEX_HARVESTED_METADATA_SOURCE.enabled()) {
                                // New - as of 6.3 - option of indexing the actual origin of
                                // harvested objects as the metadata source:
                                datafileSolrInputDocument.addField(SearchFields.METADATA_SOURCE,
                                        dataset.getHarvestedFrom() != null ? dataset.getHarvestedFrom().getName() : HARVESTED);
                            } else {
                                datafileSolrInputDocument.addField(SearchFields.METADATA_SOURCE, HARVESTED);
                            }
                        } else {
                            datafileSolrInputDocument.addField(SearchFields.IS_HARVESTED, false);
                            datafileSolrInputDocument.addField(SearchFields.METADATA_SOURCE, rdvName);
                        }
                    }
                    if (fileSortByDate == null) {
                        if (datasetSortByDate != null) {
                            logger.info("fileSortByDate was null, assigning datasetSortByDate");
                            fileSortByDate = datasetSortByDate;
                        } else {
                            logger.info("fileSortByDate and datasetSortByDate were null, assigning 'now'");
                            fileSortByDate = new Date();
                        }
                    }
                    datafileSolrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, fileSortByDate);

                    if (majorVersionReleaseDate == null && !datafile.isHarvested()) {
                        datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING);
                    }

                    if (datasetVersion.isInReview()) {
                        datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, IN_REVIEW_STRING);
                    }

                    String fileSolrDocId = solrDocIdentifierFile + fileEntityId;
                    if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.PUBLISHED)) {
                        fileSolrDocId = solrDocIdentifierFile + fileEntityId;
                        datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING);
                        if (FeatureFlags.ADD_PUBLICOBJECT_SOLR_FIELD.enabled()) {
                            datafileSolrInputDocument.addField(SearchFields.PUBLIC_OBJECT, true);
                        }
                        // datafileSolrInputDocument.addField(SearchFields.PERMS, publicGroupString);
                        addDatasetReleaseDateToSolrDoc(datafileSolrInputDocument, dataset);
                        // has this published file been deleted from the current draft version?
                        if (datafilesInDraftVersion != null && !datafilesInDraftVersion.contains(datafile.getId())) {
                            datafileSolrInputDocument.addField(SearchFields.FILE_DELETED, true);
                        }
                    } else if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.WORKING_COPY)) {
                        fileSolrDocId = solrDocIdentifierFile + fileEntityId + indexableDataset.getDatasetState().getSuffix();
                        datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING);
                    }
                    datafileSolrInputDocument.addField(SearchFields.ID, fileSolrDocId);

                    datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_FRIENDLY, fileMetadata.getDataFile().getFriendlyType());
                    datafileSolrInputDocument.addField(SearchFields.FILE_CONTENT_TYPE, fileMetadata.getDataFile().getContentType());
                    datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, fileMetadata.getDataFile().getFriendlyType());
                    // For the file type facets, we have a property file that maps mime types
                    // to facet-friendly names; "application/fits" should become "FITS", etc.:
                    datafileSolrInputDocument.addField(SearchFields.FILE_TYPE, FileUtil.getIndexableFacetFileType(fileMetadata.getDataFile()));
                    datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, FileUtil.getIndexableFacetFileType(fileMetadata.getDataFile()));
                    datafileSolrInputDocument.addField(SearchFields.FILE_SIZE_IN_BYTES, fileMetadata.getDataFile().getFilesize());
                    if (DataFile.ChecksumType.MD5.equals(fileMetadata.getDataFile().getChecksumType())) {
                        /**
                         * @todo Someday we should probably deprecate this
                         * FILE_MD5 in favor of a combination of
                         * FILE_CHECKSUM_TYPE and FILE_CHECKSUM_VALUE.
                         */
                        datafileSolrInputDocument.addField(SearchFields.FILE_MD5, fileMetadata.getDataFile().getChecksumValue());
                    }
                    datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_TYPE, fileMetadata.getDataFile().getChecksumType().toString());
                    datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_VALUE, fileMetadata.getDataFile().getChecksumValue());
                    datafileSolrInputDocument.addField(SearchFields.DESCRIPTION, fileMetadata.getDescription());
                    datafileSolrInputDocument.addField(SearchFields.FILE_DESCRIPTION, fileMetadata.getDescription());
                    GlobalId filePid = fileMetadata.getDataFile().getGlobalId();
                    datafileSolrInputDocument.addField(SearchFields.FILE_PERSISTENT_ID,
                            (filePid != null) ? filePid.toString() : null);
                    datafileSolrInputDocument.addField(SearchFields.UNF, fileMetadata.getDataFile().getUnf());
                    datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths);
                    // datafileSolrInputDocument.addField(SearchFields.HOST_DATAVERSE,
                    // dataFile.getOwner().getOwner().getName());
                    // datafileSolrInputDocument.addField(SearchFields.PARENT_NAME,
                    // dataFile.getDataset().getTitle());
                    datafileSolrInputDocument.addField(SearchFields.PARENT_ID, datasetId);
                    datafileSolrInputDocument.addField(SearchFields.PARENT_IDENTIFIER, datasetGlobalId);
                    datafileSolrInputDocument.addField(SearchFields.PARENT_CITATION, datasetCitation);

                    datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, parentDatasetTitle);

                    // If this is a tabular data file -- i.e., if there are data
                    // variables associated with this file, we index the variable
                    // names and labels:
                    if (fileMetadata.getDataFile().isTabularData()) {
                        List<DataVariable> variables = fileMetadata.getDataFile().getDataTable().getDataVariables();

                        List<VariableMetadata> variablesByMetadata = variableService.findVarMetByFileMetaId(fileMetadata.getId());
                        Map<Long, VariableMetadata> variableMap =
                            variablesByMetadata.stream().collect(Collectors.toMap(VariableMetadata::getId, Function.identity()));

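                        // Note: the map above is keyed by VariableMetadata id, while the
                        // lookup below uses the DataVariable id (var.getId()); if those ids
                        // differ, vm is null and only the basic variable label is indexed.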
                        for (DataVariable var : variables) {
                            // Hard-coded search fields, for now:
                            // TODO: eventually: review, decide how datavariables should
                            // be handled for indexing purposes. (should it be a fixed
                            // setup, defined in the code? should it be flexible? unlikely
                            // that this needs to be domain-specific... since these data
                            // variables are quite specific to tabular data, which in turn
                            // is something social science-specific...
                            // anyway -- needs to be reviewed. -- L.A. 4.0alpha1

                            // Variable Name
                            if (var.getName() != null && !var.getName().equals("")) {
                                datafileSolrInputDocument.addField(SearchFields.VARIABLE_NAME, var.getName());
                            }

                            VariableMetadata vm = variableMap.get(var.getId());
                            if (vm == null) {
                                // Variable Label
                                if (var.getLabel() != null && !var.getLabel().equals("")) {
                                    datafileSolrInputDocument.addField(SearchFields.VARIABLE_LABEL, var.getLabel());
                                }
                            } else {
                                if (vm.getLabel() != null && !vm.getLabel().equals("")) {
                                    datafileSolrInputDocument.addField(SearchFields.VARIABLE_LABEL, vm.getLabel());
                                }
                                if (vm.getLiteralquestion() != null && !vm.getLiteralquestion().equals("")) {
                                    datafileSolrInputDocument.addField(SearchFields.LITERAL_QUESTION, vm.getLiteralquestion());
                                }
                                if (vm.getInterviewinstruction() != null && !vm.getInterviewinstruction().equals("")) {
                                    datafileSolrInputDocument.addField(SearchFields.INTERVIEW_INSTRUCTIONS, vm.getInterviewinstruction());
                                }
                                if (vm.getPostquestion() != null && !vm.getPostquestion().equals("")) {
                                    datafileSolrInputDocument.addField(SearchFields.POST_QUESTION, vm.getPostquestion());
                                }
                                if (vm.getUniverse() != null && !vm.getUniverse().equals("")) {
                                    datafileSolrInputDocument.addField(SearchFields.VARIABLE_UNIVERSE, vm.getUniverse());
                                }
                                if (vm.getNotes() != null && !vm.getNotes().equals("")) {
                                    datafileSolrInputDocument.addField(SearchFields.VARIABLE_NOTES, vm.getNotes());
                                }

                            }
                        }

                        // TABULAR DATA TAGS:
                        // (not to be confused with the file categories, indexed above!)
                        for (DataFileTag tag : fileMetadata.getDataFile().getTags()) {
                            String tagLabel = tag.getTypeLabel();
                            datafileSolrInputDocument.addField(SearchFields.TABDATA_TAG, tagLabel);
                        }
                    }

                    filesIndexed.add(fileSolrDocId);
                    docs.add(datafileSolrInputDocument);
                }
            }
            if (embargoEndDate != null) {
                solrInputDocument.addField(SearchFields.EMBARGO_END_DATE, embargoEndDate.toEpochDay());
            }
            if (retentionEndDate != null) {
                solrInputDocument.addField(SearchFields.RETENTION_END_DATE, retentionEndDate.toEpochDay());
            }
        }
        Long datasetId = dataset.getId();
        final String msg = "indexed dataset " + datasetId + " as " + datasetSolrDocId + ". filesIndexed: " + filesIndexed;
        return new SolrInputDocuments(docs, msg, datasetId);
    }

    private String addOrUpdateDataset(IndexableDataset indexableDataset, Set<Long> datafilesInDraftVersion) throws SolrServerException, IOException {
        final SolrInputDocuments docs = toSolrDocs(indexableDataset, datafilesInDraftVersion);

        try {
            solrClientService.getSolrClient().add(docs.getDocuments());
        } catch (SolrServerException | IOException ex) {
            if (ex.getCause() instanceof SolrServerException) {
                throw new SolrServerException(ex);
            } else if (ex.getCause() instanceof IOException) {
                throw new IOException(ex);
            }
        }
        return docs.getMessage();
    }

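    // Caveat (inferred from general EJB semantics, not from project docs):
    // container annotations such as @Asynchronous and @TransactionAttribute
    // are only honored on business-method invocations through an EJB proxy;
    // on private methods invoked directly, as below, they are effectively
    // ignored.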
    @Asynchronous
1697
    private void updateLastIndexedTime(Long id) {
1698
        // indexing is often in a transaction with update statements
1699
        // if we flush on query (flush-mode auto), we want to prevent locking
1700
        // -> update the dataset asynchronously in a new transaction
1701
        updateLastIndexedTimeInNewTransaction(id);
×
1702
    }
×

    @TransactionAttribute(REQUIRES_NEW)
    private void updateLastIndexedTimeInNewTransaction(Long id) {
        /// Dataset updatedDataset =
        /// (Dataset)dvObjectService.updateContentIndexTime(dataset);
        /// updatedDataset = null;
        // instead of making a call to dvObjectService, let's try and
        // modify the index time stamp using the local EntityManager:
        DvObject dvObjectToModify = em.find(DvObject.class, id);
        dvObjectToModify.setIndexTime(new Timestamp(new Date().getTime()));
        dvObjectToModify = em.merge(dvObjectToModify);
        em.flush();
    }

    /**
     * If the "Topic Classification" has a "Vocabulary", return both the "Term"
     * and the "Vocabulary" with the latter in parentheses. For example, the
     * Murray Research Archive uses "1 (Generations)" and "yes (Follow-up
     * permitted)".
     */
    private String getTopicClassificationTermOrTermAndVocabulary(DatasetField topicClassDatasetField) {
        String finalValue = null;
        String topicClassVocab = null;
        String topicClassValue = null;
        for (DatasetField sibling : topicClassDatasetField.getParentDatasetFieldCompoundValue().getChildDatasetFields()) {
            DatasetFieldType datasetFieldType = sibling.getDatasetFieldType();
            String name = datasetFieldType.getName();
            if (name.equals(DatasetFieldConstant.topicClassVocab)) {
                topicClassVocab = sibling.getDisplayValue();
            } else if (name.equals(DatasetFieldConstant.topicClassValue)) {
                topicClassValue = sibling.getDisplayValue();
            }
            if (topicClassValue != null) {
                if (topicClassVocab != null) {
                    finalValue = topicClassValue + " (" + topicClassVocab + ")";
                } else {
                    finalValue = topicClassValue;
                }
            }
        }
        return finalValue;
    }

    public List<String> findPathSegments(Dataverse dataverse, List<String> segments) {
        return findPathSegments(dataverse, segments, null);
    }

    public List<String> findPathSegments(Dataverse dataverse, List<String> segments, Dataverse topOfPath) {
        Dataverse rootDataverse = findRootDataverseCached();
        if (topOfPath == null) {
            topOfPath = rootDataverse;
        }
        if (!dataverse.equals(rootDataverse)) {
            // important when creating the root dataverse
            if (dataverse.getOwner() != null) {
                findPathSegments(dataverse.getOwner(), segments, topOfPath);
            }
            segments.add(dataverse.getId().toString());
            return segments;
        } else {
            // base case
            return segments;
        }
    }

    private boolean hasAnyLinkingDataverses(Dataverse dataverse) {
        Dataverse rootDataverse = findRootDataverseCached();
        List<Dataverse> ancestorList = dataverse.getOwners();
        ancestorList.add(dataverse);
        for (Dataverse prior : ancestorList) {
            if (!dataverse.equals(rootDataverse)) {
                List<Dataverse> linkingDVs = dvLinkingService.findLinkingDataverses(prior.getId());
                if (!linkingDVs.isEmpty()) {
                    return true;
                }
            }
        }
        return false;
    }

    private List<Dataverse> findAllLinkingDataverses(DvObject dvObject) {
        /*
        find the dataverses that link the input object directly,
        then any dataverses that link the dataverses in its owner chain
        */
        Dataset dataset = null;
        Dataverse dv = null;
        Dataverse rootDataverse = findRootDataverseCached();
        List<Dataverse> linkingDataverses = new ArrayList<>();
        List<Dataverse> ancestorList = new ArrayList<>();

        try {
            if (dvObject.isInstanceofDataset()) {
                dataset = (Dataset) dvObject;
                linkingDataverses = dsLinkingService.findLinkingDataverses(dataset.getId());
                ancestorList = dataset.getOwner().getOwners();
                ancestorList.add(dataset.getOwner()); // to show the dataset in the linking dv when its parent dv is linked
            }
            if (dvObject.isInstanceofDataverse()) {
                dv = (Dataverse) dvObject;
                linkingDataverses = dvLinkingService.findLinkingDataverses(dv.getId());
                ancestorList = dv.getOwners();
            }
        } catch (Exception ex) {
            logger.info("failed to find Linking Dataverses for " + SearchFields.SUBTREE + ": " + ex);
        }

        for (Dataverse owner : ancestorList) {
            if (!owner.equals(rootDataverse)) {
                linkingDataverses.addAll(dvLinkingService.findLinkingDataverses(owner.getId()));
            }
        }

        return linkingDataverses;
    }

    private List<String> findLinkingDataversePaths(List<Dataverse> linkingDVs) {

        List<String> pathListAccumulator = new ArrayList<>();
        for (Dataverse toAdd : linkingDVs) {
            // get paths for each linking dataverse
            List<String> linkingDataversePathSegmentsAccumulator = findPathSegments(toAdd, new ArrayList<>());
            List<String> linkingDataversePaths = getDataversePathsFromSegments(linkingDataversePathSegmentsAccumulator);
            for (String dvPath : linkingDataversePaths) {
                if (!pathListAccumulator.contains(dvPath)) {
                    pathListAccumulator.add(dvPath);
                }
            }
        }

        return pathListAccumulator;
    }

    private List<String> getDataversePathsFromSegments(List<String> dataversePathSegments) {
        List<String> subtrees = new ArrayList<>();
        for (int i = 0; i < dataversePathSegments.size(); i++) {
            StringBuilder pathBuilder = new StringBuilder();
            for (int j = 0; j <= i; j++) {
                pathBuilder.append("/").append(dataversePathSegments.get(j));
            }
            subtrees.add(pathBuilder.toString());
        }
        return subtrees;
    }
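
    // Worked example (illustrative ids): for a dataverse with ownership chain
    // root -> 5 -> 9, findPathSegments() accumulates ["5", "9"] (the root is the
    // base case and contributes no segment), and getDataversePathsFromSegments()
    // expands that into ["/5", "/5/9"]. Indexing every prefix as its own SUBTREE
    // value is what lets a search scoped to dataverse 5 match everything below it.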

    private void addLicenseToSolrDoc(SolrInputDocument solrInputDocument, DatasetVersion datasetVersion) {
        if (datasetVersion != null && datasetVersion.getTermsOfUseAndAccess() != null) {
            // Test whether the terms of use are the defaults set in 5.10; if so,
            // and there is no license, don't add a license to the Solr doc.
            // Fixes #10513.
            if (datasetVersionService.isVersionDefaultCustomTerms(datasetVersion)) {
                return;
            }

            String licenseName = "Custom Terms";
            if (datasetVersion.getTermsOfUseAndAccess().getLicense() != null) {
                licenseName = datasetVersion.getTermsOfUseAndAccess().getLicense().getName();
            } else if (datasetVersion.getTermsOfUseAndAccess().getTermsOfUse() == null) {
                // this also fixes #10513 for datasets harvested in oai_dc - these
                // have neither the license id, nor any actual custom terms
                return;
            }
            solrInputDocument.addField(SearchFields.DATASET_LICENSE, licenseName);
        }
    }

    private void addDataverseReleaseDateToSolrDoc(SolrInputDocument solrInputDocument, Dataverse dataverse) {
        if (dataverse.getPublicationDate() != null) {
            Calendar calendar = Calendar.getInstance();
            calendar.setTimeInMillis(dataverse.getPublicationDate().getTime());
            int publicationYear = calendar.get(Calendar.YEAR);
            solrInputDocument.addField(SearchFields.PUBLICATION_YEAR, publicationYear);
        }
    }

    private void addDatasetReleaseDateToSolrDoc(SolrInputDocument solrInputDocument, Dataset dataset) {
        if (dataset.getPublicationDate() != null) {
            Calendar calendar = Calendar.getInstance();
            calendar.setTimeInMillis(dataset.getPublicationDate().getTime());
            int publicationYear = calendar.get(Calendar.YEAR);
            solrInputDocument.addField(SearchFields.PUBLICATION_YEAR, publicationYear);
            solrInputDocument.addField(SearchFields.DATASET_PUBLICATION_DATE, publicationYear);
        }
    }

    public static String getGroupPrefix() {
        return groupPrefix;
    }

    public static String getGroupPerUserPrefix() {
        return groupPerUserPrefix;
    }

    public static String getPublicGroupString() {
        return publicGroupString;
    }

    public static String getPUBLISHED_STRING() {
        return PUBLISHED_STRING;
    }

    public static String getUNPUBLISHED_STRING() {
        return UNPUBLISHED_STRING;
    }

    public static String getDRAFT_STRING() {
        return DRAFT_STRING;
    }

    public static String getIN_REVIEW_STRING() {
        return IN_REVIEW_STRING;
    }

    public static String getDEACCESSIONED_STRING() {
        return DEACCESSIONED_STRING;
    }

    private void updatePathForExistingSolrDocs(DvObject object) throws SolrServerException, IOException {
        SolrQuery solrQuery = new SolrQuery();
        solrQuery.setQuery(SearchUtil.constructQuery(SearchFields.ENTITY_ID, object.getId().toString()));

        QueryResponse res = solrClientService.getSolrClient().query(solrQuery);

        if (!res.getResults().isEmpty()) {
            SolrDocument doc = res.getResults().get(0);
            SolrInputDocument sid = new SolrInputDocument();

            for (String fieldName : doc.getFieldNames()) {
                sid.addField(fieldName, doc.getFieldValue(fieldName));
            }

            Dataset dataset = null;
            if (object.isInstanceofDataset()) {
                dataset = datasetService.findDeep(object.getId());
            }
            List<String> paths = object.isInstanceofDataset() ? retrieveDVOPaths(dataset)
                    : retrieveDVOPaths(dataverseService.find(object.getId()));

            sid.removeField(SearchFields.SUBTREE);
            sid.addField(SearchFields.SUBTREE, paths);
            UpdateResponse addResponse = solrClientService.getSolrClient().add(sid);
            if (object.isInstanceofDataset()) {
                for (DataFile df : dataset.getFiles()) {
                    solrQuery.setQuery(SearchUtil.constructQuery(SearchFields.ENTITY_ID, df.getId().toString()));
                    res = solrClientService.getSolrClient().query(solrQuery);
                    if (!res.getResults().isEmpty()) {
                        doc = res.getResults().get(0);
                        sid = new SolrInputDocument();
                        for (String fieldName : doc.getFieldNames()) {
                            sid.addField(fieldName, doc.getFieldValue(fieldName));
                        }
                        sid.removeField(SearchFields.SUBTREE);
                        sid.addField(SearchFields.SUBTREE, paths);
                        addResponse = solrClientService.getSolrClient().add(sid);
                    }
                }
            }
        }
    }
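
    // Note: the method above re-adds each document with all of its stored fields
    // copied over just to change SUBTREE. SolrJ also supports atomic updates that
    // modify a single field in place; a minimal sketch, assuming SearchFields.ID
    // is the schema's uniqueKey field:
    //
    //     SolrInputDocument atomic = new SolrInputDocument();
    //     atomic.addField(SearchFields.ID, docId);
    //     atomic.addField(SearchFields.SUBTREE,
    //             java.util.Collections.singletonMap("set", paths)); // "set" replaces the values
    //     solrClientService.getSolrClient().add(atomic);
    //
    // Atomic updates require the affected fields to be stored (or docValues), so
    // the copy-everything approach above may be the safer default.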

    private List<String> retrieveDVOPaths(DvObject dvo) {
        List<String> dataversePathSegmentsAccumulator = new ArrayList<>();
        List<String> dataverseSegments = new ArrayList<>();
        Dataset dataset = null;
        Dataverse dv = null;
        try {
            if (dvo.isInstanceofDataset()) {
                dataset = (Dataset) dvo;
                dataverseSegments = findPathSegments(dataset.getOwner(), dataversePathSegmentsAccumulator);
            }
            if (dvo.isInstanceofDataverse()) {
                dv = (Dataverse) dvo;
                dataverseSegments = findPathSegments(dv, dataversePathSegmentsAccumulator);
            }
        } catch (Exception ex) {
            logger.info("failed to find dataverseSegments for dataversePaths for " + SearchFields.SUBTREE + ": " + ex);
        }
        List<String> dataversePaths = getDataversePathsFromSegments(dataverseSegments);
        if (!dataversePaths.isEmpty() && dvo.isInstanceofDataverse()) {
            // Remove the dataverse's own id from the paths; fixes a bug where,
            // if the parent dataverse was linked, the dataverse showed up as
            // linked to itself.
            dataversePaths.remove(dataversePaths.size() - 1);
        }
        // add linking paths
        dataversePaths.addAll(findLinkingDataversePaths(findAllLinkingDataverses(dvo)));
        return dataversePaths;
    }

    public String delete(Dataverse doomed) {
        logger.fine("deleting Solr document for dataverse " + doomed.getId());
        UpdateResponse updateResponse;
        try {
            updateResponse = solrClientService.getSolrClient().deleteById(solrDocIdentifierDataverse + doomed.getId());
        } catch (SolrServerException | IOException ex) {
            return ex.toString();
        }
        String response = "Successfully deleted dataverse " + doomed.getId() + " from Solr index. updateResponse was: " + updateResponse.toString();
        logger.fine(response);
        return response;
    }

    /**
     * @todo call this in fewer places, favoring
     * SolrIndexServiceBean.deleteMultipleSolrIds instead to operate in batches
     *
     * https://github.com/IQSS/dataverse/issues/142
     */
    public String removeSolrDocFromIndex(String doomed) {

        logger.fine("deleting Solr document: " + doomed);
        UpdateResponse updateResponse;
        try {
            updateResponse = solrClientService.getSolrClient().deleteById(doomed);
        } catch (SolrServerException | IOException ex) {
            return ex.toString();
        }
        String response = "Attempted to delete " + doomed + " from Solr index. updateResponse was: " + updateResponse.toString();
        logger.fine(response);
        return response;
    }

    private List<String> findSolrDocIdsForDraftFilesToDelete(Dataset datasetWithDraftFilesToDelete) {
        List<String> solrIdsOfFilesToDelete = new ArrayList<>();
        for (DatasetVersion datasetVersion : datasetWithDraftFilesToDelete.getVersions()) {
            for (FileMetadata fileMetadata : datasetVersion.getFileMetadatas()) {
                DataFile datafile = fileMetadata.getDataFile();
                if (datafile != null) {
                    solrIdsOfFilesToDelete.add(solrDocIdentifierFile + datafile.getId() + draftSuffix);
                }
            }
        }
        return solrIdsOfFilesToDelete;
    }

    private List<String> findSolrDocIdsForFilesToDelete(Dataset dataset, IndexableDataset.DatasetState state) {
        List<String> solrIdsOfFilesToDelete = new ArrayList<>();
        for (DataFile file : dataset.getFiles()) {
            solrIdsOfFilesToDelete.add(solrDocIdentifierFile + file.getId() + state.getSuffix());
        }
        return solrIdsOfFilesToDelete;
    }

    private String removeMultipleSolrDocs(List<String> docIds) {
        IndexResponse indexResponse = solrIndexService.deleteMultipleSolrIds(docIds);
        return indexResponse.toString();
    }

    private String determinePublishedDatasetSolrDocId(Dataset dataset) {
        return IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.PUBLISHED.getSuffix();
    }

    private String determineDeaccessionedDatasetId(Dataset dataset) {
        return IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.DEACCESSIONED.getSuffix();
    }
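
    // The two helpers above assemble Solr document ids from the indexable type
    // name, the database id, and a version-state suffix. Assuming the usual suffix
    // values, that yields ids like "dataset_42" for the published version and
    // "dataset_42_deaccessioned" for the deaccessioned one; draft documents get a
    // draft suffix elsewhere in this class.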

    // Only used when FeatureFlags.REDUCE_SOLR_DELETES is disabled
    private String removeDeaccessioned(Dataset dataset) {
        StringBuilder result = new StringBuilder();
        String deleteDeaccessionedResult = removeSolrDocFromIndex(determineDeaccessionedDatasetId(dataset));
        result.append(deleteDeaccessionedResult);
        List<String> docIds = findSolrDocIdsForFilesToDelete(dataset, IndexableDataset.DatasetState.DEACCESSIONED);
        String deleteFilesResult = removeMultipleSolrDocs(docIds);
        result.append(deleteFilesResult);
        return result.toString();
    }

    // Only used when FeatureFlags.REDUCE_SOLR_DELETES is disabled
    private String removePublished(Dataset dataset) {
        StringBuilder result = new StringBuilder();
        String deletePublishedResult = removeSolrDocFromIndex(determinePublishedDatasetSolrDocId(dataset));
        result.append(deletePublishedResult);
        List<String> docIds = findSolrDocIdsForFilesToDelete(dataset, IndexableDataset.DatasetState.PUBLISHED);
        String deleteFilesResult = removeMultipleSolrDocs(docIds);
        result.append(deleteFilesResult);
        return result.toString();
    }

    // Only used when FeatureFlags.REDUCE_SOLR_DELETES is disabled
    private String deleteDraftFiles(List<String> solrDocIdsForDraftFilesToDelete) {
        IndexResponse indexResponse = solrIndexService.deleteMultipleSolrIds(solrDocIdsForDraftFilesToDelete);
        return indexResponse.toString();
    }

    private Dataverse findRootDataverseCached() {
        if (true) {
            /**
             * @todo Is the code below working at all? We don't want the root
             * dataverse to be indexed into Solr. Specifically, we don't want a
             * dataverse "card" to show up while browsing.
             *
             * Let's just find the root dataverse and be done with it. We'll
             * figure out the caching later.
             */
            try {
                Dataverse rootDataverse = dataverseService.findRootDataverse();
                return rootDataverse;
            } catch (EJBException ex) {
                logger.info("caught " + ex);
                Throwable cause = ex.getCause();
                while (cause.getCause() != null) {
                    logger.info("caused by... " + cause);
                    cause = cause.getCause();
                }
                return null;
            }
        }

        /**
         * @todo Why isn't this code working?
         */
        if (rootDataverseCached != null) {
            return rootDataverseCached;
        } else {
            rootDataverseCached = dataverseService.findRootDataverse();
            if (rootDataverseCached != null) {
                return rootDataverseCached;
            } else {
                throw new RuntimeException("unable to determine root dataverse");
            }
        }
    }

    private String getDesiredCardState(Map<DatasetVersion.VersionState, Boolean> desiredCards) {
        /**
         * @todo make a JVM option to enforce sanity checks? Call it dev=true?
         */
        boolean sanityCheck = true;
        if (sanityCheck) {
            Set<DatasetVersion.VersionState> expected = new HashSet<>();
            expected.add(DatasetVersion.VersionState.DRAFT);
            expected.add(DatasetVersion.VersionState.RELEASED);
            expected.add(DatasetVersion.VersionState.DEACCESSIONED);
            if (!desiredCards.keySet().equals(expected)) {
                throw new RuntimeException("Mismatch between expected version states (" + expected + ") and version states passed in (" + desiredCards.keySet() + ")");
            }
        }
        return "Desired state for existence of cards: " + desiredCards + "\n";
    }

    /**
     * @return Dataverses that should be reindexed either because they have
     * never been indexed or their index time is before their modification time.
     * (Excludes the root because it is never indexed.)
     */
    public List<Long> findStaleOrMissingDataverses() {
        List<Long> staleDataverseIds = dataverseService.findIdStale();
        Long rootId = dataverseService.findRootDataverse().getId();
        return staleDataverseIds.stream()
                .filter(id -> !id.equals(rootId))
                .collect(Collectors.toList());
    }

    /**
     * @return Datasets that should be reindexed either because they have never
     * been indexed or their index time is before their modification time.
     */
    public List<Long> findStaleOrMissingDatasets() {
        return datasetService.findIdStale();
    }

    public List<String> findDataversesInSolrOnly() throws SearchException {
        /**
         * @todo define this centrally and statically
         */
        return findDvObjectInSolrOnly("dataverses");
    }

    public List<String> findDatasetsInSolrOnly() throws SearchException {
        /**
         * @todo define this centrally and statically
         */
        return findDvObjectInSolrOnly("datasets");
    }

    public List<String> findFilesInSolrOnly() throws SearchException {
        /**
         * @todo define this centrally and statically
         */
        return findDvObjectInSolrOnly("files");
    }

    /**
     * Finds permissions documents in Solr that don't have corresponding dvObjects
     * in the database, and returns a list of their Solr "id" field.
     * @return list of "id" field values for the orphaned Solr permission documents
     * @throws SearchException
     */
    public List<String> findPermissionsInSolrOnly() throws SearchException {
        logger.info("Checking for solr-only permissions");
        List<String> permissionInSolrOnly = new ArrayList<>();
        try {
            int rows = 1000;
            SolrQuery q = (new SolrQuery(SearchFields.DEFINITION_POINT_DVOBJECT_ID + ":*")).setRows(rows).setSort(SortClause.asc(SearchFields.ID));
            String cursorMark = CursorMarkParams.CURSOR_MARK_START;
            boolean done = false;
            while (!done) {
                q.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
                QueryResponse rsp = solrServer.query(q);
                String nextCursorMark = rsp.getNextCursorMark();
                logger.fine("Next cursor mark (1K entries): " + nextCursorMark);
                SolrDocumentList list = rsp.getResults();
                for (SolrDocument doc : list) {
                    long id = Long.parseLong((String) doc.getFieldValue(SearchFields.DEFINITION_POINT_DVOBJECT_ID));
                    String docId = (String) doc.getFieldValue(SearchFields.ID);
                    String dtype = dvObjectService.getDtype(id);
                    if (dtype == null) {
                        permissionInSolrOnly.add(docId);
                    } else if (dtype.equals(DType.Dataset.getDType())) {
                        List<String> states = datasetService.getVersionStates(id);
                        if (states != null) {
                            String latestState = states.get(states.size() - 1);
                            if (docId.endsWith("draft_permission")) {
                                if (!latestState.equals(VersionState.DRAFT.toString())) {
                                    permissionInSolrOnly.add(docId);
                                }
                            } else if (docId.endsWith("deaccessioned_permission")) {
                                if (!latestState.equals(VersionState.DEACCESSIONED.toString())) {
                                    permissionInSolrOnly.add(docId);
                                }
                            } else {
                                if (!states.contains(VersionState.RELEASED.toString())) {
                                    permissionInSolrOnly.add(docId);
                                }
                            }
                        }
                    } else if (dtype.equals(DType.DataFile.getDType())) {
                        List<VersionState> states = dataFileService.findVersionStates(id);
                        Set<String> strings = states.stream().map(VersionState::toString).collect(Collectors.toSet());
                        logger.finest("States for " + docId + ": " + String.join(", ", strings));
                        if (docId.endsWith("draft_permission")) {
                            if (!states.contains(VersionState.DRAFT)) {
                                permissionInSolrOnly.add(docId);
                            }
                        } else if (docId.endsWith("deaccessioned_permission")) {
                            if (!states.contains(VersionState.DEACCESSIONED) && states.size() == 1) {
                                permissionInSolrOnly.add(docId);
                            }
                        } else {
                            if (!states.contains(VersionState.RELEASED)) {
                                permissionInSolrOnly.add(docId);
                            } else {
                                if (!dataFileService.isInReleasedVersion(id)) {
                                    logger.finest("Adding doc " + docId + " to list of permissions in Solr only");
                                    permissionInSolrOnly.add(docId);
                                }
                            }
                        }
                    }
                }
                if (cursorMark.equals(nextCursorMark)) {
                    done = true;
                }
                cursorMark = nextCursorMark;
            }
        } catch (SolrServerException | IOException ex) {
            throw new SearchException("Error searching Solr for permissions", ex);
        } catch (Exception e) {
            logger.log(Level.WARNING, "Unexpected error while checking for Solr-only permissions", e);
        }
        return permissionInSolrOnly;
    }
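
    // The while (!done) loop above is Solr's cursorMark deep-paging pattern: each
    // response returns a nextCursorMark, the query is re-issued with it, and the
    // scan ends when the cursor stops advancing. Unlike start/rows paging it stays
    // efficient at any depth, but it requires a sort that includes the uniqueKey
    // field, which is why these queries sort on SearchFields.ID. The same pattern
    // appears in findDvObjectInSolrOnly() below.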

    private List<String> findDvObjectInSolrOnly(String type) throws SearchException {
        SolrQuery solrQuery = new SolrQuery();
        int rows = 100;

        solrQuery.setQuery("*").setRows(rows).setSort(SortClause.asc(SearchFields.ID));
        solrQuery.addFilterQuery(SearchFields.TYPE + ":" + type);
        List<String> dvObjectInSolrOnly = new ArrayList<>();

        String cursorMark = CursorMarkParams.CURSOR_MARK_START;
        boolean done = false;
        while (!done) {
            solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
            QueryResponse rsp = null;
            try {
                rsp = solrServer.query(solrQuery);
            } catch (SolrServerException | IOException ex) {
                throw new SearchException("Error searching Solr type: " + type, ex);
            }
            String nextCursorMark = rsp.getNextCursorMark();
            SolrDocumentList list = rsp.getResults();
            for (SolrDocument doc : list) {
                Object idObject = doc.getFieldValue(SearchFields.ENTITY_ID);
                if (idObject != null) {
                    try {
                        long id = (Long) idObject;
                        if (dvObjectService.getDtype(id) == null) {
                            dvObjectInSolrOnly.add((String) doc.getFieldValue(SearchFields.ID));
                        }
                    } catch (ClassCastException ex) {
                        throw new SearchException("Found " + SearchFields.ENTITY_ID + " but error casting " + idObject + " to long", ex);
                    }
                }
            }
            if (cursorMark.equals(nextCursorMark)) {
                done = true;
            }
            cursorMark = nextCursorMark;
        }

        return dvObjectInSolrOnly;
    }

    private List<String> findFilesOfParentDataset(long parentDatasetId) throws SearchException {
        SolrQuery solrQuery = new SolrQuery();
        solrQuery.setQuery("*");
        solrQuery.setRows(Integer.MAX_VALUE);
        solrQuery.addFilterQuery(SearchFields.PARENT_ID + ":" + parentDatasetId);
        /**
         * @todo "files" should be a constant
         */
        solrQuery.addFilterQuery(SearchFields.TYPE + ":" + "files");
        List<String> dvObjectInSolrOnly = new ArrayList<>();
        QueryResponse queryResponse = null;
        try {
            queryResponse = solrClientService.getSolrClient().query(solrQuery);
        } catch (SolrServerException | IOException ex) {
            throw new SearchException("Error searching Solr for dataset parent id " + parentDatasetId, ex);
        }
        SolrDocumentList results = queryResponse.getResults();
        for (SolrDocument solrDocument : results) {
            Object idObject = solrDocument.getFieldValue(SearchFields.ID);
            if (idObject != null) {
                String id = (String) idObject;
                dvObjectInSolrOnly.add(id);
            }
        }
        return dvObjectInSolrOnly;
    }
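
    // Unlike the cursorMark scans above, this method pulls all matching file
    // documents in a single request via setRows(Integer.MAX_VALUE). That is simple
    // and works for typical datasets, but for datasets with very large file counts
    // the cursorMark pattern used in findDvObjectInSolrOnly() would scale better.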
2362

2363
    // This is a convenience method for deleting all the SOLR documents
2364
    // (Datasets and DataFiles) harvested by a specific HarvestingClient.
2365
    // The delete logic is a bit simpler, than when deleting "real", local
2366
    // datasets and files - for example, harvested datasets are never Drafts, etc.
2367
    // We are also less concerned with the diagnostics; if any of it fails,
2368
    // we don't need to treat it as a fatal condition.
2369
    public void deleteHarvestedDocuments(HarvestingClient harvestingClient) {
2370
        List<String> solrIdsOfDatasetsToDelete = new ArrayList<>();
×
2371

2372
        // I am going to make multiple solrIndexService.deleteMultipleSolrIds() calls;
2373
        // one call for the list of datafiles in each dataset; then one more call to
2374
        // delete all the dataset documents.
2375
        // I'm *assuming* this is safer than to try and make one complete list of
2376
        // all the documents (datasets and datafiles), and then attempt to delete
2377
        // them all at once... (is there a limit??) The list can be huge - if the
2378
        // harvested archive is on the scale of Odum or ICPSR, with thousands of
2379
        // datasets and tens of thousands of files.
2380
        //
2381
        for (Dataset harvestedDataset : harvestingClient.getHarvestedDatasets()) {
×
2382
            solrIdsOfDatasetsToDelete.add(solrDocIdentifierDataset + harvestedDataset.getId());
×
2383

2384
            List<String> solrIdsOfDatafilesToDelete = new ArrayList<>();
×
2385
            for (DataFile datafile : harvestedDataset.getFiles()) {
×
2386
                solrIdsOfDatafilesToDelete.add(solrDocIdentifierFile + datafile.getId());
×
2387
            }
×
2388
            logger.fine("attempting to delete the following datafiles from the index: " + StringUtils.join(solrIdsOfDatafilesToDelete, ","));
×
2389
            IndexResponse resultOfAttemptToDeleteFiles = solrIndexService.deleteMultipleSolrIds(solrIdsOfDatafilesToDelete);
×
2390
            logger.fine("result of an attempted delete of the harvested files associated with the dataset " + harvestedDataset.getId() + ": " + resultOfAttemptToDeleteFiles);
×
2391

2392
        }
×
2393

2394
        logger.fine("attempting to delete the following datasets from the index: " + StringUtils.join(solrIdsOfDatasetsToDelete, ","));
×
2395
        IndexResponse resultOfAttemptToDeleteDatasets = solrIndexService.deleteMultipleSolrIds(solrIdsOfDatasetsToDelete);
×
2396
        logger.fine("result of attempt to delete harvested datasets associated with the client: " + resultOfAttemptToDeleteDatasets + "\n");
×
2397

2398
    }
×
2399

2400
    // Another convenience method, for deleting all the SOLR documents (dataset_
2401
    // and datafile_s) associated with a harveste dataset. The comments for the
2402
    // method above apply here too.
2403
    public void deleteHarvestedDocuments(Dataset harvestedDataset) {
2404
        List<String> solrIdsOfDocumentsToDelete = new ArrayList<>();
×
2405
        solrIdsOfDocumentsToDelete.add(solrDocIdentifierDataset + harvestedDataset.getId());
×
2406

2407
        for (DataFile datafile : harvestedDataset.getFiles()) {
×
2408
            solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + datafile.getId());
×
2409
        }
×
2410

2411
        deleteHarvestedDocuments(solrIdsOfDocumentsToDelete);
×
2412
    }
×
2413
    
2414
    public void deleteHarvestedDocuments(List<String> solrIdsOfDocumentsToDelete) {
2415

2416
        logger.fine("attempting to delete the following documents from the index: " + StringUtils.join(solrIdsOfDocumentsToDelete, ","));
×
2417
        IndexResponse resultOfAttemptToDeleteDocuments = solrIndexService.deleteMultipleSolrIds(solrIdsOfDocumentsToDelete);
×
2418
        logger.fine("result of attempt to delete harvested documents: " + resultOfAttemptToDeleteDocuments + "\n");
×
2419
    }
×
2420

2421
}