• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

kermitt2 / grobid / 398

pending completion
398

push

circleci

more robustness in case of reference segmenter deficiency

2 of 2 new or added lines in 1 file covered. (100.0%)

14848 of 37500 relevant lines covered (39.59%)

0.4 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

51.39
/grobid-core/src/main/java/org/grobid/core/engines/ReferenceSegmenterParser.java
1
package org.grobid.core.engines;
2

3
import com.google.common.base.Function;
4
import com.google.common.collect.Lists;
5
import com.google.common.collect.Sets;
6
import org.grobid.core.GrobidModels;
7
import org.grobid.core.document.Document;
8
import org.grobid.core.document.DocumentPiece;
9
import org.grobid.core.document.DocumentPointer;
10
import org.grobid.core.engines.citations.LabeledReferenceResult;
11
import org.grobid.core.engines.citations.ReferenceSegmenter;
12
import org.grobid.core.engines.label.SegmentationLabels;
13
import org.grobid.core.engines.label.TaggingLabels;
14
import org.grobid.core.engines.tagging.GenericTaggerUtils;
15
import org.grobid.core.engines.tagging.GrobidCRFEngine;
16
import org.grobid.core.exceptions.GrobidException;
17
import org.grobid.core.features.FeatureFactory;
18
import org.grobid.core.features.FeaturesVectorReferenceSegmenter;
19
import org.grobid.core.layout.Block;
20
import org.grobid.core.layout.LayoutToken;
21
import org.grobid.core.tokenization.LabeledTokensContainer;
22
import org.grobid.core.tokenization.TaggingTokenSynchronizer;
23
import org.grobid.core.utilities.BoundingBoxCalculator;
24
import org.grobid.core.utilities.GrobidProperties;
25
import org.grobid.core.utilities.TextUtilities;
26
import org.grobid.core.utilities.Triple;
27
import org.slf4j.Logger;
28
import org.slf4j.LoggerFactory;
29

30
import java.util.ArrayList;
31
import java.util.Collections;
32
import java.util.Iterator;
33
import java.util.List;
34
import java.util.SortedSet;
35
import java.util.regex.Matcher;
36

37
import org.apache.commons.lang3.tuple.Pair;
38

39
public class ReferenceSegmenterParser extends AbstractParser implements ReferenceSegmenter {
40
        // Class-wide SLF4J logger; used below to report token/label synchronisation problems
        // while generating training data.
        private static final Logger LOGGER = LoggerFactory.getLogger(ReferenceSegmenterParser.class);
1✔
41

42
    // projection scale for line length
    // NOTE(review): not referenced in the visible part of this file - presumably consumed by the
    // feature-generation code further down; confirm before changing the value.
43
    private static final int LINESCALE = 10;
44

45
    /**
     * Creates a parser bound to the {@link GrobidModels#REFERENCE_SEGMENTER} model.
     * The constructor is protected, so instances can only be created from this package or from subclasses.
     */
    protected ReferenceSegmenterParser() {
        super(GrobidModels.REFERENCE_SEGMENTER);
    }
1✔
48

49
        @Override
50
        public List<LabeledReferenceResult> extract(String referenceBlock) {
51
                Document res = Document.createFromText(referenceBlock);
×
52

53
                DocumentPiece piece = new DocumentPiece(
×
54
                                new DocumentPointer(0, 0, 0),
55
                                new DocumentPointer(0, res.getTokenizations().size() - 1, res.getTokenizations().size() - 1));
×
56

57
                return extract(res, Sets.newTreeSet(Collections.singletonList(piece)), false);
×
58
        }
59

60
        /**
61
     *
62
     * @param doc Document object
63
     * @return <reference_label, reference_string>  Note, that label is null when no label was detected
64
     *              example: <"[1]", "Hu W., Barkana, R., &amp; Gruzinov A. Phys. Rev. Lett. 85, 1158">
65
     */
66
        public List<LabeledReferenceResult> extract(Document doc) {
67
                return extract(doc, false);
1✔
68
        }
69

70
        public List<LabeledReferenceResult> extract(Document doc, boolean training) {
71
                SortedSet<DocumentPiece> referencesParts = doc.getDocumentPart(SegmentationLabels.REFERENCES);
1✔
72
                return extract(doc, referencesParts, training);
1✔
73
        }
74

75
    public List<LabeledReferenceResult> extract(Document doc, SortedSet<DocumentPiece> referencesParts, boolean training) {
76

77
                Pair<String,List<LayoutToken>> featSeg = getReferencesSectionFeatured(doc, referencesParts);
1✔
78
                String res;
79
                List<LayoutToken> tokenizationsReferences;
80
                if (featSeg == null) {
1✔
81
                        return null;
×
82
                }
83
                // if featSeg is null, it usually means that no reference segment is found in the
84
                // document segmentation
85
                String featureVector = featSeg.getLeft();
1✔
86
                tokenizationsReferences = featSeg.getRight();
1✔
87
                try {
88
                        
89
                        // to support long sequence in case of RNN usage we segment in pieces of less than the 
90
                        // max_sequence_length and quite significantly overlapping 
91
                        // this does not apply to CRF which can process "infinite" input sequence
92
                        // this is relevant to the reference segmenter RNN model, which is position-free in its 
93
                        // application, but could not be generalized to other RNN or transformer model long inputs
94
                        if (GrobidProperties.getGrobidCRFEngine(GrobidModels.REFERENCE_SEGMENTER) == GrobidCRFEngine.DELFT) {
1✔
95
                                String[] featureVectorLines = featureVector.split("\n");
×
96

97
/*for(LayoutToken token : tokenizationsReferences) {
98
System.out.print(token.getText());
99
}
100
System.out.println("\n");
101
System.out.println("total input lines: " + featureVectorLines.length + " - " + tokenizationsReferences.size() + " tokens");*/
102

103
                                int originalMaxSequence = 2000;
×
104
                                if (GrobidProperties.getInstance().getDelftRuntimeMaxSequenceLength(GrobidModels.REFERENCE_SEGMENTER.getModelName()) != -1) {
×
105
                                        originalMaxSequence = GrobidProperties.getInstance().getDelftRuntimeMaxSequenceLength(GrobidModels.REFERENCE_SEGMENTER.getModelName());
×
106
                                }
107

108
                                if (featureVectorLines.length < originalMaxSequence || originalMaxSequence < 600) {
×
109
                                        // if the input is lower than max sequence length, not need to segment
110
                                        // if the max sequence length is too small, e.g. transformer, we won't be able to manage 
111
                                        // overlaps adapted to references
112
                                        res = label(featureVector);
×
113
                                } else {
114
                                        // we adjust max sequence value to take into account 500 token lines overlap
115
                                        int maxSequence = Math.max(500, originalMaxSequence - 1000);
×
116

117
//System.out.println("originalMaxSequence: " + originalMaxSequence + " / maxSequence adjusted to: " + maxSequence);
118

119
                                        List<List<String>> featureVectorPieces = new ArrayList<>();
×
120
                                        // segment the input vectors in overlapping sequences, according to the model max_sequence_length parameter
121
                                        for(int i=0; (i*maxSequence) < featureVectorLines.length; i++) {
×
122
                                                int lowerBound = i*maxSequence;
×
123
                                                // overlapping: this localRes has 500 extra lines after the normal end
124
                                                int upperBound = Math.min( ((i+1)*maxSequence)+500, featureVectorLines.length );
×
125
                                                if (featureVectorLines.length - lowerBound < originalMaxSequence)
×
126
                                                        upperBound = featureVectorLines.length;
×
127
                                                
128
//System.out.println("lowerBound: " + lowerBound + " - upperBound: " + upperBound);
129
                                                List<String> featureVectorPiece = new ArrayList<>();
×
130
                                                for(int j=lowerBound; j<upperBound; j++)
×
131
                                                        featureVectorPiece.add(featureVectorLines[j]);
×
132
                                                featureVectorPieces.add(featureVectorPiece);
×
133

134
                                                if (upperBound == featureVectorLines.length)
×
135
                                                        break;
×
136
                                        }
137

138
/*System.out.println("featureVectorPieces.size(): " + featureVectorPieces.size());
139
for(List<String> featureVectorPiece : featureVectorPieces) {
140
System.out.println(featureVectorPiece.size());
141
}*/
142
                                        // label every pieces in batch
143
                                        List<String> allRes = new ArrayList<>();
×
144
                                        List<String> allVectors = new ArrayList<>();
×
145
                                        for(List<String> featureVectorPiece : featureVectorPieces) {
×
146
                                                StringBuilder localFeatureVector = new StringBuilder();
×
147
                                                for(int j=0; j<featureVectorPiece.size(); j++) {
×
148
                                                        localFeatureVector.append(featureVectorPiece.get(j)).append("\n");
×
149
                                                }
150
                                                allVectors.add(localFeatureVector.toString());
×
151
                                        }
×
152
                                        
153
                                        // parallel labeling of the input segments
154
                                        String fullRes = label(allVectors);
×
155

156
                                        // segment this result to get back the input chunk alignment (with extra 500 overlaping lines) 
157
                                        String[] fullResLines = fullRes.split("\n");
×
158
                                        int pos = 0;
×
159
                                        for(List<String> featureVectorPiece : featureVectorPieces) {
×
160
                                                StringBuilder localRes = new StringBuilder();
×
161
                                                int localSize = featureVectorPiece.size();
×
162
                                                for(int i=pos; i<pos+localSize; i++) {
×
163
                                                        localRes.append(fullResLines[i]).append("\n");
×
164
                                                }
165
                                                allRes.add(localRes.toString());
×
166
                                                pos += localSize;
×
167
                                        }
×
168
                                        
169
                                        // combine results and reconnect smoothly overlaps 
170
                                        StringBuilder resBuilder = new StringBuilder();
×
171
                                        int previousTransitionPos = 0;
×
172
                                        for(int i=0; i<allRes.size(); i++) {
×
173
                                                String localRes = allRes.get(i);
×
174
                                                String[] localResLines = localRes.split("\n");
×
175
//System.out.println("localResLines.length: " + localResLines.length);
176
                                                int transitionPos = localResLines.length;
×
177
                                                if (i != allRes.size()-1) {
×
178
                                                        // in the trailing redundant part (500 last lines), we identify the line index 
179
                                                        // of the last "closing" label, this is the point where we will reconnect the 
180
                                                        // labeled segments to avoid breaking a labeled field
181
                                                        
182
                                                        for(int k=localResLines.length-1; k>=0; k--) {
×
183
                                                                if (localResLines.length-k == 500) {
×
184
                                                                        // this is the max overlap, we don't go beyond! 
185
                                                                        transitionPos = k;
×
186
                                                                        break;
×
187
                                                                }
188

189
                                                                String line = localResLines[k];
×
190
                                                                if (line.endsWith(TaggingLabels.GROBID_START_ENTITY_LABEL_PREFIX+"<label>") || 
×
191
                                                                        line.endsWith(TaggingLabels.GROBID_START_ENTITY_LABEL_PREFIX+"<reference>")) {
×
192
                                                                        // we can stop the line before this one
193
                                                                        transitionPos = k;
×
194
                                                                        break;
×
195
                                                                }
196
                                                        }
197
                                                } 
198
                                                // else: we are at the last chunk, so we take the content until the very end
199

200
//System.out.println("previousTransitionPos: " + previousTransitionPos);
201
//System.out.println("transitionPos: " + transitionPos + "\n");
202

203
                                                List<String> selectedlocalResLines = new ArrayList<>();
×
204
                                                for(int j= previousTransitionPos; j<transitionPos; j++) {
×
205
                                                        if (j == previousTransitionPos && previousTransitionPos != 0) {
×
206
                                                                // we want to be sure to have a starting label
207
                                                                String localLine = localResLines[j];
×
208
                                                                if (localLine.indexOf(TaggingLabels.GROBID_START_ENTITY_LABEL_PREFIX) == -1) {
×
209
                                                                        localLine = localLine.replace("<label>", TaggingLabels.GROBID_START_ENTITY_LABEL_PREFIX+"<label>");
×
210
                                                                        localLine = localLine.replace("<reference>", TaggingLabels.GROBID_START_ENTITY_LABEL_PREFIX+"<reference>");
×
211
                                                                }
212
                                                                selectedlocalResLines.add(localLine);
×
213
                                                        } else if (j == previousTransitionPos && previousTransitionPos == 0 && i != 0) {
×
214
                                                                // previousTransitionPos is 0 and we are not at the first segment: we had a non overlapping
215
                                                                // transition, we might want to avoid a starting label at this point 
216
                                                                String localLine = localResLines[j];
×
217
                                                                if (localLine.indexOf(TaggingLabels.GROBID_START_ENTITY_LABEL_PREFIX) != -1) {
×
218
                                                                        localLine = localLine.replace(TaggingLabels.GROBID_START_ENTITY_LABEL_PREFIX+"<label>", "<label>");
×
219
                                                                        localLine = localLine.replace(TaggingLabels.GROBID_START_ENTITY_LABEL_PREFIX+"<reference>", "<reference>");
×
220
                                                                }
221
                                                                selectedlocalResLines.add(localLine);
×
222
                                                        } else {
×
223
                                                                selectedlocalResLines.add(localResLines[j]);
×
224
                                                        }
225
                                                }
226
                                                for(String localResLine : selectedlocalResLines)
×
227
                                                        resBuilder.append(localResLine).append("\n");
×
228
                                                
229
                                                previousTransitionPos = transitionPos-maxSequence;
×
230
                                        }
231
                                        res = resBuilder.toString();
×
232
                                }
233
                        } else
×
234
                                res = label(featureVector);
1✔
235
                }
236
                catch(Exception e) {
×
237
                        throw new GrobidException("Labeling in ReferenceSegmenter fails.", e);
×
238
                }
1✔
239
                if (res == null) {
1✔
240
                        return null;
×
241
                }
242
        
243
        // if we extract for generating training data, we also give back the used features
244
        List<Triple<String, String, String>> labeled = GenericTaggerUtils.getTokensWithLabelsAndFeatures(res, training);
1✔
245

246
        return getExtractionResult(tokenizationsReferences, labeled);
1✔
247
    }
248

249
    private List<LabeledReferenceResult> getExtractionResult(List<LayoutToken> tokenizations, List<Triple<String, String, String>> labeled) {
250
        final List<LabeledReferenceResult> resultList = new ArrayList<>();
1✔
251
        final StringBuilder reference = new StringBuilder();
1✔
252
        final List<LayoutToken> referenceTokens = new ArrayList<>();
1✔
253
        final StringBuilder features = new StringBuilder();
1✔
254
        final StringBuilder referenceLabel = new StringBuilder();
1✔
255

256
        TaggingTokenSynchronizer synchronizer = new TaggingTokenSynchronizer(null, labeled, tokenizations);
1✔
257

258
        Function<LabeledTokensContainer, Void> function = new Function<LabeledTokensContainer, Void>() {
1✔
259
            @Override public Void apply(LabeledTokensContainer container) {
260
                features.append(container.getFeatureString());
1✔
261
                features.append('\n');
1✔
262
                if (container.isBeginning()) {
1✔
263
                    if (reference.length() != 0) {
1✔
264
                        resultList.add(new LabeledReferenceResult(referenceLabel.length() == 0 ? null :
1✔
265
                            referenceLabel.toString().trim(), reference.toString().trim(), Lists.newArrayList(referenceTokens),
1✔
266
                                    features.toString(), BoundingBoxCalculator.calculate(referenceTokens)));
1✔
267
                        reference.setLength(0);
1✔
268
                        referenceLabel.setLength(0);
1✔
269
                        features.setLength(0);
1✔
270
                        referenceTokens.clear();
1✔
271
                    }
272
                }
273
                return null;
1✔
274
            }
275
        };
276

277
        Iterator<LabeledTokensContainer> iterator = synchronizer.iterator();
1✔
278
        while (iterator.hasNext()) {
1✔
279
            LabeledTokensContainer container = iterator.next();
1✔
280
            if (container == null)
1✔
281
                continue;
×
282
            String tok = container.getToken();
1✔
283
            String plainLabel = container.getPlainLabel();
1✔
284
            if ("<label>".equals(plainLabel)) {
1✔
285
                function.apply(container);
1✔
286
                referenceLabel.append(tok);
1✔
287

288
                if (container.isTrailingSpace() || container.isTrailingNewLine()) {
1✔
289
                    referenceLabel.append(' ');
1✔
290
                }
291
            } else if (plainLabel.equals("<reference>")) {
1✔
292
                function.apply(container);
1✔
293
                reference.append(tok);
1✔
294

295
                if (container.isTrailingSpace()) {
1✔
296
                    reference.append(' ');
1✔
297
                }
298
                if (container.isTrailingNewLine()) {
1✔
299
                    reference.append('\n');
1✔
300
                }
301

302
                referenceTokens.addAll(container.getLayoutTokens());
1✔
303
            } else if (plainLabel.equals("<other>")) {
1✔
304
                // NOP
305
            }
306

307
            // Handle last one.
308
            if (!iterator.hasNext()) {
1✔
309
                resultList.add(new LabeledReferenceResult(referenceLabel.length() == 0 ? null :
1✔
310
                    referenceLabel.toString().trim(), reference.toString().trim(),
1✔
311
                    referenceTokens, features.toString(),
1✔
312
                    BoundingBoxCalculator.calculate(referenceTokens)));
1✔
313
                reference.setLength(0);
1✔
314
                referenceLabel.setLength(0);
1✔
315
            }
316
        }
1✔
317

318
        return resultList;
1✔
319
    }
320

321
        public Pair<String,String> createTrainingData(Document doc, int id) {
322
                SortedSet<DocumentPiece> referencesParts = doc.getDocumentPart(SegmentationLabels.REFERENCES);
×
323
                Pair<String,List<LayoutToken>> featSeg = getReferencesSectionFeatured(doc, referencesParts);
×
324
                String res;
325
                List<LayoutToken> tokenizations;
326
                if (featSeg == null) {
×
327
                        return null;
×
328
                }
329
                // if featSeg is null, it usually means that no reference segment is found in the
330
                // document segmentation
331
                String featureVector = featSeg.getLeft();
×
332
                tokenizations = featSeg.getRight();
×
333
                try {
334
                        res = label(featureVector);
×
335
                }
336
                catch(Exception e) {
×
337
                        throw new GrobidException("CRF labeling in ReferenceSegmenter fails.", e);
×
338
                }
×
339
                if (res == null) {
×
340
                        return null;
×
341
                }
342
        List<Pair<String, String>> labeled = GenericTaggerUtils.getTokensAndLabels(res);
×
343
        StringBuilder sb = new StringBuilder();
×
344

345
                //noinspection StringConcatenationInsideStringBufferAppend
346
                sb.append("<tei xml:space=\"preserve\">\n" +
×
347
                                "    <teiHeader>\n" +
348
                                "        <fileDesc xml:id=\"_" + id + "\"/>\n" +
349
                                "    </teiHeader>\n" +
350
                                "    <text xml:lang=\"en\">\n" +
351
                                "        <listBibl>\n");
352

353
                int tokPtr = 0;
×
354
                boolean addSpace = false;
×
355
                boolean addEOL = false;
×
356
                String lastTag = null;
×
357
                boolean refOpen = false;
×
358
                for (Pair<String, String> l : labeled) {
×
359
            String tok = l.getLeft();
×
360
            String label = l.getRight();
×
361

362
                        int tokPtr2 = tokPtr;
×
363
            for(; tokPtr2 < tokenizations.size(); tokPtr2++) {
×
364
                if (tokenizations.get(tokPtr2).t().equals(" ")) {
×
365
                                        addSpace = true;
×
366
                                }
367
                                else if (tokenizations.get(tokPtr2).t().equals("\n") ||
×
368
                                             tokenizations.get(tokPtr).t().equals("\r") ) {
×
369
                                        addEOL = true;
×
370
                                }
371
                else {
372
                                        break;
373
                                }
374
            }
375
                        tokPtr = tokPtr2;
×
376

377
            if (tokPtr >= tokenizations.size()) {
×
378
                                LOGGER.error("Implementation error: Reached the end of tokenizations, but current token is " + tok);
×
379
                                // we add a space to avoid concatenated text
380
                                addSpace = true;
×
381
            }
382
            else {
383
                                String tokenizationToken = tokenizations.get(tokPtr).getText();
×
384

385
                                if ((tokPtr != tokenizations.size()) && !tokenizationToken.equals(tok)) {
×
386
                                        // and we add a space by default to avoid concatenated text
387
                                        addSpace = true;
×
388
                                        if (!tok.startsWith(tokenizationToken)) {
×
389
                                                // this is a very exceptional case due to a sequence of accent/diacresis, in this case we skip
390
                                                // a shift in the tokenizations list and continue on the basis of the labeled token
391
                                                // we check one ahead
392
                                                tokPtr++;
×
393
                                                tokenizationToken = tokenizations.get(tokPtr).getText();
×
394
                                                if (!tok.equals(tokenizationToken)) {
×
395
                                                        // we try another position forward (second hope!)
396
                                                        tokPtr++;
×
397
                                                        tokenizationToken = tokenizations.get(tokPtr).getText();
×
398
                                                        if (!tok.equals(tokenizationToken)) {
×
399
                                                                // we try another position forward (last hope!)
400
                                                                tokPtr++;
×
401
                                                                tokenizationToken = tokenizations.get(tokPtr).getText();
×
402
                                                                if (!tok.equals(tokenizationToken)) {
×
403
                                                                        // we return to the initial position
404
                                                                        tokPtr = tokPtr-3;
×
405
                                                                        tokenizationToken = tokenizations.get(tokPtr).getText();
×
406
                                                                        LOGGER.error("Implementation error, tokens out of sync: " +
×
407
                                                                                tokenizationToken + " != " + tok + ", at position " + tokPtr);
408
                                                                }
409
                                                        }
410
                                                }
411
                                        }
412
                                        // note: if the above condition is true, this is an exceptional case due to a
413
                                        // sequence of accent/diacresis and we can go on as a full string match
414
                    }
415
                        }
416

417
                        String plainLabel = GenericTaggerUtils.getPlainLabel(label);
×
418

419
                        boolean tagClosed = (lastTag != null) && testClosingTag(sb, label, lastTag, addSpace, addEOL);
×
420

421
                        if (tagClosed) {
×
422
                                addSpace = false;
×
423
                                addEOL = false;
×
424
                        }
425
                        if (tagClosed && lastTag.equals("<reference>")) {
×
426
                                refOpen = false;
×
427
                        }
428
                        String output;
429
                        String field;
430
                        if (refOpen) {
×
431
                                field = "<label>";
×
432
                        }
433
                        else {
434
                                field = "<bibl><label>";
×
435
                        }
436
                        output = writeField(label, lastTag, tok, "<label>", field, addSpace, addEOL, 2);
×
437
                        if (output != null) {
×
438
                                sb.append(output);
×
439
                                refOpen = true;
×
440
                        }
441
                        else {
442
                                if (refOpen) {
×
443
                                        field = "";
×
444
                                }
445
                                else {
446
                                        field = "<bibl>";
×
447
                                }
448
                                output = writeField(label, lastTag, tok, "<reference>", field, addSpace, addEOL, 2);
×
449
                                if (output != null) {
×
450
                                        sb.append(output);
×
451
                                        refOpen= true;
×
452
                                }
453
                                else {
454
                                        output = writeField(label, lastTag, tok, "<other>", "", addSpace, addEOL, 2);
×
455
                                        if (output != null) {
×
456
                                                sb.append(output);
×
457
                                                refOpen = false;
×
458
                                        }
459
                                }
460
                        }
461

462
                        lastTag = plainLabel;
×
463
                        addSpace = false;
×
464
                        addEOL = false;
×
465
            tokPtr++;
×
466
        }
×
467

468
                if (refOpen) {
×
469
                        sb.append("</bibl>");
×
470
                }
471

472
        sb.append("\n        </listBibl>\n" +
×
473
                "    </text>\n" +
474
                "</tei>\n");
475

476
                return Pair.of(sb.toString(), featureVector);
×
477
    }
478

479

480
        private boolean testClosingTag(StringBuilder buffer,
481
                                   String currentTag,
482
                                   String lastTag,
483
                                                                   boolean addSpace,
484
                                                                   boolean addEOL) {
485
        boolean res = false;
×
486
        if (!currentTag.equals(lastTag)) {
×
487
            res = true;
×
488
            // we close the current tag
489
            if (lastTag.equals("<other>")) {
×
490
                                if (addEOL)
×
491
                    buffer.append("<lb/>");
×
492
                                if (addSpace)
×
493
                    buffer.append(" ");
×
494
                buffer.append("\n");
×
495
            } else if (lastTag.equals("<label>")) {
×
496
                                buffer.append("</label>");
×
497
                                if (addEOL)
×
498
                    buffer.append("<lb/>");
×
499
                                if (addSpace)
×
500
                    buffer.append(" ");
×
501
            } else if (lastTag.equals("<reference>")) {
×
502
                                if (addEOL)
×
503
                    buffer.append("<lb/>");
×
504
                                if (addSpace)
×
505
                    buffer.append(" ");
×
506
                buffer.append("</bibl>\n");
×
507
            } else {
508
                res = false;
×
509
            }
510
        }
511
        return res;
×
512
    }
513

514
    private String writeField(String currentTag,
515
                              String lastTag,
516
                              String token,
517
                              String field,
518
                              String outField,
519
                              boolean addSpace,
520
                                                          boolean addEOL,
521
                                                          int nbIndent) {
522
        String result = null;
×
523
        if (currentTag.endsWith(field)) {
×
524
            if (currentTag.endsWith("<other>")) {
×
525
                result = "";
×
526
                                if (currentTag.equals("I-<other>")) {
×
527
                                        result += "\n";
×
528
                                        for (int i = 0; i < nbIndent; i++) {
×
529
                            result += "    ";
×
530
                        }
531
                                }
532
                                if (addEOL)
×
533
                    result += "<lb/>";
×
534
                                if (addSpace)
×
535
                    result += " ";
×
536
                result += TextUtilities.HTMLEncode(token);
×
537
            }
538
                        else if ((lastTag != null) && currentTag.endsWith(lastTag)) {
×
539
                result = "";
×
540
                                if (addEOL)
×
541
                    result += "<lb/>";
×
542
                                if (addSpace)
×
543
                    result += " ";
×
544
                                if (currentTag.startsWith("I-"))
×
545
                                        result += outField;
×
546
                result += TextUtilities.HTMLEncode(token);
×
547
            }
548
                        else {
549
                result = "";
×
550
                                if (outField.length() > 0) {
×
551
                                        for (int i = 0; i < nbIndent; i++) {
×
552
                            result += "    ";
×
553
                        }
554
                                }
555
                                if (addEOL)
×
556
                    result += "<lb/>";
×
557
                if (addSpace)
×
558
                    result += " ";
×
559
                result += outField + TextUtilities.HTMLEncode(token);
×
560
            }
561
        }
562
        return result;
×
563
    }
564

565
        static public Pair<String,List<LayoutToken>> getReferencesSectionFeatured(Document doc,
566
                                                                                                SortedSet<DocumentPiece> referencesParts) {
567
                if ((referencesParts == null) || (referencesParts.size() == 0)) {
1✔
568
                        return null;
×
569
                }
570
                FeatureFactory featureFactory = FeatureFactory.getInstance();
1✔
571
                List<Block> blocks = doc.getBlocks();
1✔
572
                if ( (blocks == null) || blocks.size() == 0) {
1✔
573
                        return null;
×
574
                }
575

576
                StringBuilder citations = new StringBuilder();
1✔
577
        boolean newline;
578
        int n; // overall token number
579

580
                FeaturesVectorReferenceSegmenter features;
581
                FeaturesVectorReferenceSegmenter previousFeatures = null;
1✔
582
                boolean endblock;
583
                boolean startblock;
584
        //int mm = 0; // token position in the sentence
585
        int nn; // token position in the line
586
                double lineStartX = Double.NaN;
1✔
587
                boolean indented = false;
1✔
588

589
                List<LayoutToken> tokenizationsReferences = new ArrayList<LayoutToken>();
1✔
590
                List<LayoutToken> tokenizations = doc.getTokenizations();
1✔
591

592
                int maxLineLength = 1;
1✔
593
                //List<Integer> lineLengths = new ArrayList<Integer>();
594
                int currentLineLength = 0;
1✔
595
                //int lineIndex = 0;
596

597
        // we calculate current max line length and intialize the body tokenization structure
598
                for(DocumentPiece docPiece : referencesParts) {
1✔
599
                        DocumentPointer dp1 = docPiece.getLeft();
1✔
600
                        DocumentPointer dp2 = docPiece.getRight();
1✔
601

602
            int tokens = dp1.getTokenDocPos();
1✔
603
            int tokene = dp2.getTokenDocPos();
1✔
604
            for (int i = tokens; i <= tokene; i++) {
1✔
605
                tokenizationsReferences.add(tokenizations.get(i));
1✔
606
                                currentLineLength += tokenizations.get(i).getText().length();
1✔
607
                                if (tokenizations.get(i).t().equals("\n") || tokenizations.get(i).t().equals("\r") ) {
1✔
608
                                        //lineLengths.add(currentLineLength);
609
                                        if (currentLineLength > maxLineLength)
1✔
610
                                                maxLineLength = currentLineLength;
1✔
611
                                        currentLineLength = 0;
1✔
612
                                }
613
            }
614
                }
1✔
615

616
                for(DocumentPiece docPiece : referencesParts) {
1✔
617
                        DocumentPointer dp1 = docPiece.getLeft();
1✔
618
                        DocumentPointer dp2 = docPiece.getRight();
1✔
619

620
/*for(int i=dp1.getTokenDocPos(); i<dp2.getTokenDocPos(); i++) {
621
        System.out.print(tokenizations.get(i));
622
}        
623
System.out.println("");
624
*/
625
                        //currentLineLength = lineLengths.get(lineIndex);
626
                        nn = 0;
1✔
627
                        int tokenIndex = 0;
1✔
628
                        int blockIndex = dp1.getBlockPtr();
1✔
629
                        Block block = null;
1✔
630
                        List<LayoutToken> tokens;
631
                        boolean previousNewline = true;
1✔
632
                        currentLineLength = 0;
1✔
633
                        String currentLineProfile = null;
1✔
634
            for (n = dp1.getTokenDocPos(); n <= dp2.getTokenDocPos(); n++) {
1✔
635
                String text = tokenizations.get(n).getText();
1✔
636

637
                                if (text == null) {
1✔
638
                                        continue;
×
639
                                }
640

641
                                // set corresponding block
642
                                if ( (block != null) && (n > block.getEndToken()) ) {
1✔
643
                                        blockIndex++;
1✔
644
                                        tokenIndex = 0;
1✔
645
                                        currentLineLength = 0;
1✔
646
                                        currentLineProfile = null;
1✔
647
                                }
648

649
                                if (blockIndex<blocks.size()) {
1✔
650
                                        block = blocks.get(blockIndex);
1✔
651
                                        if (n == block.getStartToken()) {
1✔
652
                                                startblock = true;
1✔
653
                                                endblock = false;
1✔
654
                                        }
655
                                        else if (n == block.getEndToken()) {
1✔
656
                                                startblock = false;
1✔
657
                                                endblock = true;
1✔
658
                                        }
659
                                        else {
660
                                                startblock = false;
1✔
661
                                                endblock = false;
1✔
662
                                        }
663
                                }
664
                                else {
665
                                        block = null;
×
666
                                        startblock = false;
×
667
                                        endblock = false;
×
668
                                }
669
                                // set corresponding token
670
                    if (block != null)
1✔
671
                                        tokens = block.getTokens();
1✔
672
                                else
673
                                        tokens = null;
×
674

675
                                if (text.equals("\n") || text.equals("\r")) {
1✔
676
                                        previousNewline = true;
1✔
677
                    nn = 0;
1✔
678
                                        currentLineLength = 0;
1✔
679
                                        currentLineProfile = null;
1✔
680
                                        //lineIndex++;
681
                                        //currentLineLength = lineLengths.get(lineIndex);
682
                    continue;
1✔
683
                }
684
                                else {
685
                    newline = false;
1✔
686
                                        nn += text.length(); // +1 for segmentation symbol
1✔
687
                                }
688

689
                                if (text.equals(" ") || text.equals("\t")) {
1✔
690
                    nn++;
1✔
691
                    continue;
1✔
692
                                }
693

694
                                if (text.trim().length() == 0) {
1✔
695
                                        continue;
×
696
                                }
697

698
                LayoutToken token = null;
1✔
699
                if (tokens != null) {
1✔
700
                    int i = tokenIndex;
1✔
701
                    while (i < tokens.size()) {
1✔
702
                        token = tokens.get(i);
1✔
703
                        if (text.equals(token.getText())) {
1✔
704
                            tokenIndex = i;
1✔
705
                            break;
1✔
706
                        }
707
                        i++;
1✔
708
                    }
709
                }
710

711
                if (previousNewline) {
1✔
712
                    newline = true;
1✔
713
                    previousNewline = false;
1✔
714
                                        if (token != null && previousFeatures != null) {
1✔
715
                                                double previousLineStartX = lineStartX;
1✔
716
                        lineStartX = token.getX();
1✔
717
                        double characterWidth = token.width / token.getText().length();
1✔
718
                                                if (!Double.isNaN(previousLineStartX)) {
1✔
719
                            // Indentation if line start is > 1 character width to the right of previous line start
720
                            if (lineStartX - previousLineStartX > characterWidth)
1✔
721
                                                    indented = true;
1✔
722
                                                // Indentation ends if line start is > 1 character width to the left of previous line start
723
                            else if (previousLineStartX - lineStartX > characterWidth)
1✔
724
                                indented = false;
1✔
725
                            // Otherwise indentation is unchanged
726
                                                }
727
                                        }
728
                }
729

730
                                if (TextUtilities.filterLine(text)) {
1✔
731
                    continue;
×
732
                }
733

734
                features = new FeaturesVectorReferenceSegmenter();
1✔
735
                features.token = token;
1✔
736
                features.string = text;
1✔
737

738
                if (newline) {
1✔
739
                    features.lineStatus = "LINESTART";
1✔
740
                }
741
                Matcher m0 = featureFactory.isPunct.matcher(text);
1✔
742
                if (m0.find()) {
1✔
743
                    features.punctType = "PUNCT";
1✔
744
                }
745
                if (text.equals("(") || text.equals("[")) {
1✔
746
                    features.punctType = "OPENBRACKET";
1✔
747

748
                } else if (text.equals(")") || text.equals("]")) {
1✔
749
                    features.punctType = "ENDBRACKET";
1✔
750

751
                } else if (text.equals(".")) {
1✔
752
                    features.punctType = "DOT";
1✔
753

754
                } else if (text.equals(",")) {
1✔
755
                    features.punctType = "COMMA";
1✔
756

757
                } else if (text.equals("-")) {
1✔
758
                    features.punctType = "HYPHEN";
1✔
759

760
                } else if (text.equals("\"") || text.equals("\'") || text.equals("`")) {
1✔
761
                    features.punctType = "QUOTE";
1✔
762

763
                }
764

765
                if ( (n == 0) || (previousNewline) ) {
1✔
766
                    features.lineStatus = "LINESTART";
×
767
                                        if (n == 0)
×
768
                                                features.blockStatus = "BLOCKSTART";
×
769
                                        nn = 0;
×
770
                }
771

772
                if (indented) {
1✔
773
                        features.alignmentStatus = "LINEINDENT";
1✔
774
                }
775
                else {
776
                        features.alignmentStatus = "ALIGNEDLEFT";
1✔
777
                }
778

779
                                {
780
                    // look ahead...
781
                    boolean endline = true;
1✔
782

783
                    int ii = 1;
1✔
784
                    boolean endloop = false;
1✔
785
                                        String accumulated = text;
1✔
786
                    while ((n + ii < tokenizations.size()) && (!endloop)) {
1✔
787
                        String tok = tokenizations.get(n + ii).getText();
1✔
788
                        if (tok != null) {
1✔
789
                                                        if (currentLineProfile == null)
1✔
790
                                                                accumulated += tok;
1✔
791
                            if (tok.equals("\n") || tok.equals("\r")) {
1✔
792
                                endloop = true;
1✔
793
                                                                if (currentLineLength ==0) {
1✔
794
                                                                        currentLineLength = accumulated.length();
1✔
795
                                                                }
796
                                                                if (currentLineProfile == null) {
1✔
797
                                                                        currentLineProfile = TextUtilities.punctuationProfile(accumulated);
1✔
798
                                                                }
799
                            }
800
                                                        else if (!tok.equals(" ") && !tok.equals("\t")) {
1✔
801
                                                                endline = false;
1✔
802
                                                        }
803
                                                        else {
804
                                if (TextUtilities.filterLine(tok)) {
1✔
805
                                    endloop = true;
×
806
                                                                        if (currentLineLength ==0) {
×
807
                                                                                currentLineLength = accumulated.length();
×
808
                                                                        }
809
                                                                        if (currentLineProfile == null) {
×
810
                                                                                currentLineProfile = TextUtilities.punctuationProfile(accumulated);
×
811
                                                                        }
812
                                }
813
                            }
814
                        }
815

816
                        if (n + ii >= tokenizations.size() - 1) {
1✔
817
                            endblock = true;
×
818
                            endline = true;
×
819
                        }
820

821
                                                if (endline && (block != null) && (n+ii == block.getEndToken())) {
1✔
822
                                                        endblock = true;
1✔
823
                                                }
824
                        ii++;
1✔
825
                    }
1✔
826

827
                    if ((!endline) && !(newline)) {
1✔
828
                        features.lineStatus = "LINEIN";
1✔
829
                    }
830
                                        else if (!newline) {
1✔
831
                        features.lineStatus = "LINEEND";
1✔
832
                        previousNewline = true;
1✔
833
                    }
834

835
                                        if (startblock) {
1✔
836
                                                features.blockStatus = "BLOCKSTART";
1✔
837
                                        }
838
                    if ((!endblock) && (features.blockStatus == null))
1✔
839
                        features.blockStatus = "BLOCKIN";
1✔
840
                    else if (features.blockStatus == null) {
1✔
841
                        features.blockStatus = "BLOCKEND";
1✔
842
                    }
843
                }
844

845
                if (text.length() == 1) {
1✔
846
                    features.singleChar = true;
1✔
847
                }
848

849
                if (Character.isUpperCase(text.charAt(0))) {
1✔
850
                    features.capitalisation = "INITCAP";
1✔
851
                }
852

853
                if (featureFactory.test_all_capital(text)) {
1✔
854
                    features.capitalisation = "ALLCAP";
1✔
855
                }
856

857
                if (featureFactory.test_digit(text)) {
1✔
858
                    features.digit = "CONTAINSDIGITS";
1✔
859
                }
860

861
                if (featureFactory.test_common(text)) {
1✔
862
                    features.commonName = true;
1✔
863
                }
864

865
                if (featureFactory.test_names(text)) {
1✔
866
                    features.properName = true;
1✔
867
                }
868

869
                if (featureFactory.test_month(text)) {
1✔
870
                    features.month = true;
1✔
871
                }
872

873
                Matcher m = featureFactory.isDigit.matcher(text);
1✔
874
                if (m.find()) {
1✔
875
                    features.digit = "ALLDIGIT";
1✔
876
                }
877

878
                Matcher m2 = featureFactory.year.matcher(text);
1✔
879
                if (m2.find()) {
1✔
880
                    features.year = true;
1✔
881
                }
882

883
                Matcher m3 = featureFactory.email.matcher(text);
1✔
884
                if (m3.find()) {
1✔
885
                    features.email = true;
×
886
                }
887

888
                Matcher m4 = featureFactory.http.matcher(text);
1✔
889
                if (m4.find()) {
1✔
890
                    features.http = true;
1✔
891
                }
892

893
                if ( (token != null) && (token.getBold()) )
1✔
894
                    features.bold = true;
1✔
895

896
                if ( (token != null) && (token.getItalic()) )
1✔
897
                    features.italic = true;
1✔
898

899
                if (features.capitalisation == null)
1✔
900
                    features.capitalisation = "NOCAPS";
1✔
901

902
                if (features.digit == null)
1✔
903
                    features.digit = "NODIGIT";
1✔
904

905
                if (features.punctType == null)
1✔
906
                    features.punctType = "NOPUNCT";
1✔
907
//System.out.println(nn + "\t" + currentLineLength + "\t" + maxLineLength);
908
                features.lineLength = featureFactory
1✔
909
                        .linearScaling(currentLineLength, maxLineLength, LINESCALE);
1✔
910

911
                                features.relativePosition = featureFactory
1✔
912
                         .linearScaling(nn, currentLineLength, LINESCALE);
1✔
913

914
                                features.punctuationProfile = currentLineProfile;
1✔
915

916
                if (previousFeatures != null)
1✔
917
                    citations.append(previousFeatures.printVector());
1✔
918
                //mm++;
919
                previousFeatures = features;
1✔
920
                        }
921
                }
1✔
922
                if (previousFeatures != null)
1✔
923
                      citations.append(previousFeatures.printVector());
1✔
924

925
                   return Pair.of(citations.toString(), tokenizationsReferences);
1✔
926
        }
927
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc