• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 13883000244

16 Mar 2025 11:44AM CUT coverage: 59.034% (-1.0%) from 60.072%
13883000244

Pull #14444

github

web-flow
Merge 6d717703b into 05000ab4a
Pull Request #14444: Sparknlp 1060 implement phi 3.5 vision

0 of 292 new or added lines in 5 files covered. (0.0%)

20 existing lines in 14 files now uncovered.

9413 of 15945 relevant lines covered (59.03%)

0.59 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.85
/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/FeatureGenerator.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators.ner.crf
18

19
import com.johnsnowlabs.ml.crf._
20
import com.johnsnowlabs.nlp.annotators.common.{TaggedSentence, WordpieceEmbeddingsSentence}
21

22
import scala.collection.mutable
23

24
/** Generates per-token CRF input features (shape, character type, affixes,
  * dictionary hits) for CrfBasedNer.
  */
case class FeatureGenerator(dictFeatures: DictionaryFeatures) {

  // Canonical representative for each punctuation family, used when building
  // token "shape" strings: every character in a family maps to one symbol.
  val shapeEncoding =
    Seq(
      ".," -> '.',
      ":;?!" -> ':',
      "-+*/=|_%" -> '-',
      "({[<" -> '(',
      ")}]>" -> ')')
      .flatMap { case (family, canonical) => family.map(_ -> canonical) }
      .toMap
51
  def getShape(token: String) = {
52
    token.map(c => {
1✔
53
      if (c.isLower)
1✔
54
        'L'
1✔
55
      else if (c.isUpper)
1✔
56
        'U'
1✔
57
      else if (c.isDigit)
1✔
58
        'D'
×
59
      shapeEncoding.getOrElse(c, c)
1✔
60
    })
61
  }
62

63
  def shrink(str: String) = {
64
    val builder = new StringBuilder()
1✔
65
    for (c <- str) {
1✔
UNCOV
66
      if (builder.length == 0 || builder.last != c)
×
67
        builder.append(c)
1✔
68
    }
69
    builder.toString
1✔
70
  }
71

  /** Bit-flag classes describing a token's overall character make-up
    * (all-uppercase, all-digit, starts-uppercase, ...). Each value's id is a
    * distinct power of two so getType can track every still-possible class
    * in a single Int mask and clear candidates character by character.
    *
    * NOTE(review): the explicit ids take part in the feature encoding
    * (f("t") = getType(token).toString); do not renumber.
    */
  object TokenType extends Enumeration {
    type TokenType = Value
    val AllUpper = Value(1 << 0)
    val AllDigit = Value(1 << 1)
    val AllSymbol = Value(1 << 2)
    val AllUpperDigit = Value(1 << 3)
    val AllUpperSymbol = Value(1 << 4)
    val AllDigitSymbol = Value(1 << 5)
    val AllUpperDigitSymbol = Value(1 << 6)
    val StartsUpper = Value(1 << 7)
    val AllLetter = Value(1 << 8)
    val AllAlnum = Value(1 << 9)

    // Mask with every flag above set: max id is 1 << 9, so 2 * max - 1
    // yields ten consecutive one-bits.
    val allTypes = values.max.id * 2 - 1
  }
  // Delimiters that may legitimately appear inside a number (e.g. "1,000.5");
  // getType treats them as digit characters so such tokens stay AllDigit.
  val digitDelims = Seq(',', '.')
90
  def getType(token: String): Int = {
91
    var types = TokenType.allTypes
1✔
92

93
    def remove(t: TokenType.TokenType) = {
94
      types = types & (~t.id)
1✔
95
    }
96

97
    var isFirst = true
1✔
98
    for (c <- token) {
1✔
99
      if (c.isUpper) {
1✔
100
        remove(TokenType.AllDigit)
1✔
101
        remove(TokenType.AllSymbol)
1✔
102
        remove(TokenType.AllDigitSymbol)
1✔
103
      } else if (c.isDigit || digitDelims.contains(c)) {
1✔
104
        remove(TokenType.AllUpper)
1✔
105
        remove(TokenType.AllSymbol)
1✔
106
        remove(TokenType.AllUpperSymbol)
1✔
107
        remove(TokenType.AllLetter)
1✔
108
      } else if (c.isLower) {
1✔
109
        remove(TokenType.AllUpper)
1✔
110
        remove(TokenType.AllDigit)
1✔
111
        remove(TokenType.AllSymbol)
1✔
112
        remove(TokenType.AllUpperDigit)
1✔
113
        remove(TokenType.AllUpperSymbol)
1✔
114
        remove(TokenType.AllDigitSymbol)
1✔
115
        remove(TokenType.AllUpperDigitSymbol)
1✔
116
      } else {
×
117
        remove(TokenType.AllUpper)
×
118
        remove(TokenType.AllDigit)
×
119
        remove(TokenType.AllUpperDigit)
×
120
        remove(TokenType.AllLetter)
×
121
        remove(TokenType.AllAlnum)
×
122
      }
123

124
      if (isFirst && !c.isUpper)
1✔
125
        remove(TokenType.StartsUpper)
1✔
126

127
      isFirst = false
1✔
128
    }
129

130
    val result = TokenType.values
131
      .filter(value => (value.id & types) > 0)
1✔
132
      .map(value => value.id)
1✔
133
      .headOption
1✔
134

135
    result.getOrElse(0)
1✔
136
  }
137

138
  def isDigitOrPredicate(token: String, predicate: Function[Char, Boolean]) = {
139
    var hasDigits = false
1✔
140
    var hasPredicate = false
1✔
141
    var hasOther = false
1✔
142
    for (c <- token) {
1✔
143
      hasDigits = hasDigits || c.isDigit
1✔
144
      hasPredicate = hasPredicate || predicate(c)
1✔
145
      hasOther = hasOther || !c.isLetterOrDigit
1✔
146
    }
147
    !hasOther && hasDigits && hasPredicate
1✔
148
  }
149

150
  def isAllSymbols(token: String) = {
151
    !token.forall(c => c.isLetterOrDigit)
×
152
  }
153

154
  def isShort(token: String) = {
155
    token.length == 2 && token(0).isUpper && token(1) == '.'
×
156
  }
157

158
  def containsUpper(token: String) = token.exists(c => c.isUpper)
1✔
159

160
  def containsLower(token: String) = token.exists(c => c.isLower)
1✔
161

162
  def containsLetter(token: String) = token.exists(c => c.isLetter)
1✔
163

164
  def containsDigit(token: String) = token.exists(c => c.isDigit)
1✔
165

166
  def containsSymbol(token: String) = token.exists(c => c.isLetterOrDigit)
1✔
167

168
  def getSuffix(token: String, size: Int, default: String = "") = {
169
    if (token.length >= size)
1✔
170
      token.substring(token.length - size).toLowerCase
1✔
171
    else
172
      default
1✔
173
  }
174

175
  def getPrefix(token: String, size: Int, default: String = "") = {
176
    if (token.length >= size)
1✔
177
      token.substring(0, size).toLowerCase
1✔
178
    else
179
      default
1✔
180
  }
181

182
  def fillFeatures(token: String): mutable.Map[String, String] = {
183
    val f = mutable.Map[String, String]()
1✔
184
    f("w") = token
1✔
185
    f("wl") = token.toLowerCase
1✔
186

187
    f("s") = getShape(token)
1✔
188
    f("h") = shrink(f("s"))
1✔
189
    f("t") = getType(token).toString
1✔
190

191
    f("p1") = getPrefix(token, 1)
1✔
192
    f("p2") = getPrefix(token, 2)
1✔
193
    f("p3") = getPrefix(token, 3)
1✔
194
    f("p4") = getPrefix(token, 4)
1✔
195

196
    f("s1") = getSuffix(token, 1)
1✔
197
    f("s2") = getSuffix(token, 2)
1✔
198
    f("s3") = getSuffix(token, 3)
1✔
199
    f("s4") = getSuffix(token, 4)
1✔
200

201
    f("dl") = isDigitOrPredicate(token, c => c.isLetter).toString
1✔
202
    f("d-") = isDigitOrPredicate(token, c => c == '-').toString
1✔
203
    f("d/") = isDigitOrPredicate(token, c => c == '/').toString
1✔
204
    f("d,") = isDigitOrPredicate(token, c => c == ',').toString
1✔
205
    f("d.") = isDigitOrPredicate(token, c => c == '.').toString
1✔
206

207
    f("u.") = isShort(token).toString
1✔
208
    f("iu") = (token.nonEmpty && token(0).isUpper).toString
1✔
209

210
    f("cu") = containsUpper(token).toString
1✔
211
    f("cl") = containsLower(token).toString
1✔
212
    f("ca") = containsLetter(token).toString
1✔
213
    f("cd") = containsDigit(token).toString
1✔
214
    f("cs") = containsSymbol(token).toString
1✔
215

216
    f
217
  }
218

  // Feature families emitted as adjacent-position pairs in generate().
  val pairs = Array("w", "pos", "h", "t")
  // Context half-window: features are drawn from positions [-2, 2] around
  // the current token.
  val window = 2

222
  def isInRange(idx: Int, size: Int) = idx >= 0 && idx < size
1✔
223

224
  def getName(source: String, idx: Int): String = {
225
    source + "~" + idx
1✔
226
  }
227

228
  def getName(source: String, idx1: Int, idx2: Int): String = {
229
    getName(source, idx1) + "|" + getName(source, idx2)
1✔
230
  }
231

  /** Builds the attributed representation of one sentence for the CRF:
    * for every token, pairwise features over adjacent positions, unary
    * features over a [-window, window] context, dictionary features, and
    * BOS/EOS markers, plus the token's word-piece embedding vector.
    *
    * Precondition (asserted below): `wordpieceEmbeddingsSentence` must carry
    * exactly one word-start token per word of `taggedSentence`.
    */
  def generate(
      taggedSentence: TaggedSentence,
      wordpieceEmbeddingsSentence: WordpieceEmbeddingsSentence): TextSentenceAttrs = {

    // Per-word feature maps, with the POS tag added under key "pos".
    val wordFeatures = taggedSentence.words
      .zip(taggedSentence.tags)
      .map { case (word, tag) =>
        val f = fillFeatures(word)
        f("pos") = tag
        f
      }

    val words = wordFeatures.length

    // wordsList is consumed one word per iteration below so that
    // dictFeatures.get always sees the remaining suffix of the sentence.
    var wordsList = taggedSentence.words.toList
    // Keep only word-start pieces so embeddings align 1:1 with words.
    val embeddings = wordpieceEmbeddingsSentence.tokens
      .filter(t => t.isWordStart)
      .map(t => t.embeddings)

    assert(
      embeddings.length == wordsList.length,
      "Mismatched embedding tokens and sentence tokens. Make sure you are properly " +
        "linking tokens and embeddings to the same inputCol DOCUMENT annotator")

    val attrs = (0 until words).map { i =>
      // Adjacent-pair features: for offsets j in [-window, window), join the
      // values of each feature family at positions i+j and i+j+1.
      val pairAttrs = (-window until window)
        .filter(j => isInRange(i + j, words) && isInRange(i + j + 1, words))
        .flatMap(j =>
          pairs.map { name =>
            val feature = getName(name, j, j + 1)
            val value1 = wordFeatures(i + j).getOrElse(name, "")
            val value2 = wordFeatures(i + j + 1).getOrElse(name, "")
            (feature, value1 + "|" + value2)
          })
        .toArray

      // Unary features: every feature of every word within the window,
      // renamed with its relative offset.
      val unoAttrs = (-window to window)
        .filter(j => isInRange(i + j, words))
        .flatMap { j =>
          wordFeatures(i + j).map { case (name, value) =>
            (getName(name, j), value)
          }
        }
        .toArray

      // Dictionary features for the suffix starting at word i; then advance.
      val dictAttrs = dictFeatures.get(wordsList).map((getName("dt", i), _))
      wordsList = wordsList.tail

      // Sentence-boundary markers. NOTE(review): for a one-word sentence
      // only _BOS_ is emitted (the first branch wins) — presumably intended;
      // confirm before changing.
      val addition =
        if (i == 0) Array(("_BOS_", ""))
        else if (i == words - 1) Array(("_EOS_", ""))
        else Array.empty[(String, String)]

      val binAttrs = pairAttrs ++ unoAttrs ++ dictAttrs ++ addition

      val numAttrs = embeddings(i)

      WordAttrs(binAttrs, numAttrs)
    }

    TextSentenceAttrs(attrs)
  }

295
  def generateDataset(sentences: TraversableOnce[
296
    (TextSentenceLabels, TaggedSentence, WordpieceEmbeddingsSentence)]): CrfDataset = {
297
    val textDataset = sentences
298
      .filter(p => p._2.words.length > 0)
1✔
299
      .map { case (labels, sentence, withEmbeddings) =>
1✔
300
        val textSentence = generate(sentence, withEmbeddings)
1✔
301
        (labels, textSentence)
1✔
302
      }
303

304
    DatasetReader.encodeDataset(textDataset)
1✔
305
  }
306

307
  def generate(
308
      sentence: TaggedSentence,
309
      withEmbeddings: WordpieceEmbeddingsSentence,
310
      metadata: DatasetMetadata): Instance = {
311
    val attrSentence = generate(sentence, withEmbeddings)
1✔
312

313
    DatasetReader.encodeSentence(attrSentence, metadata)
1✔
314
  }
315
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc