• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

kermitt2 / grobid / 385

pending completion
385

push

circleci

review incremental training

3 of 3 new or added lines in 2 files covered. (100.0%)

14846 of 37503 relevant lines covered (39.59%)

0.4 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidFilterDeleteSpaceBetweenSameAlphabet.java
1
/**
2
 * Licensed to the Apache Software Foundation (ASF) under one or more
3
 * contributor license agreements.  See the NOTICE file distributed with
4
 * this work for additional information regarding copyright ownership.
5
 * The ASF licenses this file to You under the Apache License, Version 2.0
6
 * (the "License"); you may not use this file except in compliance with
7
 * the License.  You may obtain a copy of the License at
8
 *
9
 *     http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 */
17

18
/**
19
 * @author Bruno Pouliquen @ WIPO
20
 */
21

22
package org.grobid.core.analyzers;
23

24
import java.io.IOException;
25

26
import org.apache.lucene.analysis.TokenFilter;
27
import org.apache.lucene.analysis.TokenStream;
28
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
29
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
30
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
31
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
32

33
public final class GrobidFilterDeleteSpaceBetweenSameAlphabet extends TokenFilter {
34
        private CharTermAttribute termAttr;
35
        private TypeAttribute typeAttr;
36
        private PositionIncrementAttribute posAttr;
37
        private OffsetAttribute offsetAttr;
38

39
        private String previousBuffer;
40
        private int previousBufferLength=0;
×
41
        private String previousType=null;
×
42
        private int previousStartOffset=0;
×
43
        private int previousEndOffset=0;
×
44
        private int previousPosIncr=0;
×
45
        
46
        public GrobidFilterDeleteSpaceBetweenSameAlphabet (TokenStream input) {
47
                super(input);
×
48
                termAttr = (CharTermAttribute) addAttribute(CharTermAttribute.class);
×
49
            typeAttr=(TypeAttribute) addAttribute(TypeAttribute.class);
×
50
            offsetAttr = (OffsetAttribute) addAttribute(OffsetAttribute.class);;
×
51
            posAttr = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
×
52
            previousBuffer=null;
×
53
        }
×
54
        
55
        
56
        public boolean incrementToken() throws IOException {
57
                
58
                if (previousBuffer != null) {
×
59
                           termAttr.setEmpty().append(previousBuffer);
×
60
                            typeAttr.setType(previousType);
×
61
                            offsetAttr.setOffset(previousStartOffset, previousEndOffset);        
×
62
                            posAttr.setPositionIncrement(previousPosIncr);
×
63
                            previousBuffer= null;
×
64
                            return true;
×
65
            }
66
                
67
                if (!input.incrementToken()) { //#B                        
×
68
                          return false; //#C
×
69
                }
70
                char[] buffer = termAttr.buffer(); 
×
71
                if (! isLatinChar(buffer[0])) return true;
×
72
                if (isDigit(buffer[0])) {
×
73
                //        if (!isNumeral(previousBuffer)) return true;
74
                }
75
                
76
                previousBuffer=termAttr.toString();
×
77
                previousBufferLength=termAttr.length();
×
78
                previousType=typeAttr.type();
×
79
                previousStartOffset=offsetAttr.startOffset();
×
80
                previousEndOffset=offsetAttr.endOffset();
×
81
                previousPosIncr = posAttr.getPositionIncrement();
×
82
                
83
            boolean cont=true;
×
84
            String currentBuffer=null;
×
85
            int currentBufferLength=0;
×
86
            String currentType=null;
×
87
            int currentStartOffset=-1;
×
88
            int currentEndOffset=-1;
×
89
            int currentPosIncr=0;
×
90
            while (cont && input.incrementToken()) {
×
91
                    currentBuffer=termAttr.toString();
×
92
                    currentBufferLength=termAttr.length();
×
93
                        currentType=typeAttr.type();
×
94
                        currentStartOffset=offsetAttr.startOffset();
×
95
                        currentEndOffset=offsetAttr.endOffset();
×
96
                        currentPosIncr = posAttr.getPositionIncrement();
×
97
                        
98
                        // Series of conditions to concatenate tokens:
99
                        if ((
×
100
                                        buffer[0]=='.' && isNumeral(previousBuffer) // 0 . => 0.
×
101
                                        ) || (
102
                                                        isNumeral(currentBuffer) && isNumeral(previousBuffer) // 1 2 => 12
×
103
                                        ) || (
104
                                                        previousBuffer.endsWith(".") && isNumeral(previousBuffer)
×
105
                                                        && isNumeral(currentBuffer) // 0. 1 => 0.1
×
106
                                        ) || (
107
                                                        currentStartOffset >= previousEndOffset && isLatinChar(buffer[0]) 
×
108
                                                        && currentType.equals(previousType) 
×
109
                                                        && ( 
110
                                                                        !(isNumeral(previousBuffer) && ! isNumeral(currentBuffer))
×
111
                                                                        
112
                                                                        )
113
                                                                        && (! (isNumeral(currentBuffer) && ! isNumeral(previousBuffer)))
×
114
                                                        // a b => ab
115
                                        )
116
                                        
117
                                ) {
118
                         //current token has the same alphabet, we concatenate them
119
                            String n = previousBuffer + currentBuffer;
×
120
                            previousBuffer=n; 
×
121
                            currentBuffer=null;
×
122
                            previousEndOffset=currentEndOffset;
×
123
                    } else {
×
124
                            cont=false; break;
×
125
                    }
126
            }
127

128
            termAttr.setEmpty().append(previousBuffer);
×
129
            typeAttr.setType(previousType);
×
130
            offsetAttr.setOffset(previousStartOffset, previousEndOffset);        
×
131
            posAttr.setPositionIncrement(previousPosIncr);
×
132
            previousBuffer= null;
×
133
            
134
            if (currentBuffer != null) {
×
135
                    previousBuffer=currentBuffer;
×
136
                    previousBufferLength=currentBufferLength;
×
137
                    previousType=currentType;
×
138
                    previousStartOffset=currentStartOffset;
×
139
                    previousEndOffset=currentEndOffset;
×
140
                    previousPosIncr = currentPosIncr;
×
141
            }
142
                return true;        
×
143
        }
144

145

146
        private boolean isDigit(char c) {
147
                return ((c>='0' && c<='9') || (c>=0xFF10 && c<=0xFF19));
×
148
        }
149

150

151
        private boolean isLatinChar(char c) {
152
                return ((c >='a' && c<='z') || (c >='A' && c<='Z')
×
153
                                || (c >='0' && c<='9')  || (c>=0xFF10 && c<=0xFF19) || (c>=0xFF01 && c<=0xFF5E)
154
                                );
155

156
        }
157
        private boolean isNumeral(String s) {
158
                return (s!=null && !s.isEmpty() && isDigit(s.charAt(0)));
×
159
        }
160

161
        @Override
162
          public void reset() throws IOException {
163
            super.reset();
×
164
            previousBuffer = null;
×
165
            previousBufferLength=0;
×
166
            previousType=null;
×
167
            previousStartOffset=0;
×
168
            previousEndOffset=0;
×
169
            previousPosIncr=0;
×
170
        }
×
171
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc