• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 18652478786

20 Oct 2025 12:47PM UTC coverage: 55.25% (+0.2%) from 55.094%
18652478786

Pull #14674

github

web-flow
Merge b08968fc1 into b827818c7
Pull Request #14674: SPARKNLP-1293 Enhancements EntityRuler and DocumentNormalizer

114 of 149 new or added lines in 3 files covered. (76.51%)

40 existing lines in 36 files now uncovered.

11919 of 21573 relevant lines covered (55.25%)

0.55 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.55
/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteModel.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators.spell.symmetric
18

19
import com.johnsnowlabs.nlp.annotators.spell.util.Utilities
20
import com.johnsnowlabs.nlp.serialization.MapFeature
21
import com.johnsnowlabs.nlp._
22
import org.apache.spark.ml.util.Identifiable
23
import org.slf4j.LoggerFactory
24

25
import scala.collection.immutable.HashSet
26
import scala.collection.mutable.{Map => MMap}
27
import scala.util.control.Breaks._
28

29
/** Symmetric Delete spelling correction algorithm.
30
  *
31
  * The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate
32
  * generation and dictionary lookup for a given Damerau-Levenshtein distance. It is six orders of
33
  * magnitude faster (than the standard approach with deletes + transposes + replaces + inserts)
34
  * and language independent.
35
  *
36
  * Inspired by [[https://github.com/wolfgarbe/SymSpell SymSpell]].
37
  *
38
  * Pretrained models can be loaded with `pretrained` of the companion object:
39
  * {{{
40
  * val spell = SymmetricDeleteModel.pretrained()
41
  *   .setInputCols("token")
42
  *   .setOutputCol("spell")
43
  * }}}
44
  * The default model is `"spellcheck_sd"`, if no name is provided. For available pretrained
45
  * models please see the [[https://sparknlp.org/models?task=Spell+Check Models Hub]].
46
  *
47
  * See
48
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteModelTestSpec.scala SymmetricDeleteModelTestSpec]]
49
  * for further reference.
50
  *
51
  * ==Example==
52
  * {{{
53
  * import spark.implicits._
54
  * import com.johnsnowlabs.nlp.base.DocumentAssembler
55
  * import com.johnsnowlabs.nlp.annotators.Tokenizer
56
  * import com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel
57
  * import org.apache.spark.ml.Pipeline
58
  *
59
  * val documentAssembler = new DocumentAssembler()
60
  *   .setInputCol("text")
61
  *   .setOutputCol("document")
62
  *
63
  * val tokenizer = new Tokenizer()
64
  *   .setInputCols("document")
65
  *   .setOutputCol("token")
66
  *
67
  * val spellChecker = SymmetricDeleteModel.pretrained()
68
  *   .setInputCols("token")
69
  *   .setOutputCol("spell")
70
  *
71
  * val pipeline = new Pipeline().setStages(Array(
72
  *   documentAssembler,
73
  *   tokenizer,
74
  *   spellChecker
75
  * ))
76
  *
77
  * val data = Seq("spmetimes i wrrite wordz erong.").toDF("text")
78
  * val result = pipeline.fit(data).transform(data)
79
  * result.select("spell.result").show(false)
80
  * +--------------------------------------+
81
  * |result                                |
82
  * +--------------------------------------+
83
  * |[sometimes, i, write, words, wrong, .]|
84
  * +--------------------------------------+
85
  * }}}
86
  *
87
  * @see
88
  *   [[com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel NorvigSweetingModel]] for
89
  *   an alternative approach to spell checking
90
  * @see
91
  *   [[com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerModel ContextSpellCheckerModel]]
92
  *   for a DL based approach
93
  * @groupname anno Annotator types
94
  * @groupdesc anno
95
  *   Required input and expected output annotator types
96
  * @groupname Ungrouped Members
97
  * @groupname param Parameters
98
  * @groupname setParam Parameter setters
99
  * @groupname getParam Parameter getters
100
  * @groupname Ungrouped Members
101
  * @groupprio param  1
102
  * @groupprio anno  2
103
  * @groupprio Ungrouped 3
104
  * @groupprio setParam  4
105
  * @groupprio getParam  5
106
  * @groupdesc param
107
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
108
  *   parameter values through setters and getters, respectively.
109
  */
110
class SymmetricDeleteModel(override val uid: String)
111
    extends AnnotatorModel[SymmetricDeleteModel]
112
    with HasSimpleAnnotate[SymmetricDeleteModel]
113
    with SymmetricDeleteParams {
114

115
  import com.johnsnowlabs.nlp.AnnotatorType._
116

117
  def this() = this(Identifiable.randomUID("SYMSPELL"))
1✔
118

119
  /** Output annotator type: TOKEN
120
    *
121
    * @group anno
122
    */
123
  override val outputAnnotatorType: AnnotatorType = TOKEN
1✔
124

125
  /** Input annotator type: TOKEN
126
    *
127
    * @group anno
128
    */
129
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)
1✔
130

131
  protected val derivedWords: MapFeature[String, (List[String], Long)] =
132
    new MapFeature(this, "derivedWords")
1✔
133

134
  protected val dictionary: MapFeature[String, Long] = new MapFeature(this, "dictionary")
1✔
135

136
  /** @group setParam */
137
  def setDictionary(value: Map[String, Long]): this.type = set(dictionary, value)
1✔
138

139
  /** @group setParam */
140
  def setDerivedWords(value: Map[String, (List[String], Long)]): this.type =
141
    set(derivedWords, value)
1✔
142

143
  private val logger = LoggerFactory.getLogger("SymmetricDeleteApproach")
1✔
144

145
  /** Lower-cased view of every key stored in `derivedWords`, built on first access and used for
    * membership tests while searching for corrections.
    */
  private lazy val allWords: HashSet[String] =
    $$(derivedWords).keys.foldLeft(HashSet.empty[String]) { (collected, key) =>
      collected + key.toLowerCase
    }
148

149
  private val CAPITAL = 'C'
1✔
150
  private val LOWERCASE = 'L'
1✔
151
  private val UPPERCASE = 'U'
1✔
152

153
  case class SuggestedWord(correction: String, frequency: Long, distance: Int, score: Double)
154

155
  /** Spell-checks every incoming token annotation.
    *
    * Each input annotation yields exactly one output annotation that keeps the original `begin`
    * and `end` offsets, carries the text returned by `checkSpellWord` as its result, and stores
    * the accompanying score under the `"confidence"` metadata key.
    *
    * @param annotations
    *   token annotations to check
    * @return
    *   one annotation of `outputAnnotatorType` per input annotation, in the same order
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
    annotations.map { token =>
      val (checkedText, confidence) = checkSpellWord(token.result)
      Annotation(
        outputAnnotatorType,
        token.begin,
        token.end,
        checkedText,
        Map("confidence" -> confidence.toString))
    }
168

169
  /** Spell-checks a single word and restores the casing style of the input on the correction.
    *
    * Words that are empty or contain anything other than ASCII letters are returned unchanged
    * with a score of `0`. Otherwise the best suggestion (if any) is re-cased to match the input
    * and returned with the suggestion's score; when no suggestion exists the input is returned
    * unchanged with a score of `0`.
    *
    * @param originalWord
    *   the token text to check
    * @return
    *   the (possibly corrected) word and its score
    */
  def checkSpellWord(originalWord: String): (String, Double) = {
    logger.debug(s"spell checker target word: $originalWord")
    // An empty token has no letters to correct and no casing to restore, so it is passed
    // through like a noisy word instead of being sent to the case detection.
    if (originalWord.isEmpty || isNoisyWord(originalWord)) {
      (originalWord, 0.0)
    } else {
      val originalCaseType = getCaseWordType(originalWord)
      getSuggestedCorrections(originalWord) match {
        case suggestedWord @ Some(best) =>
          logger.debug(
            s"Received: $originalWord. Best correction is: $suggestedWord. " +
              s"Because frequency was ${best.frequency} " +
              s"and edit distance was ${best.distance}")
          (transformToOriginalCaseType(originalCaseType, best.correction), best.score)
        case None =>
          (originalWord, 0.0)
      }
    }
  }
190

191
  /** Tells whether a word contains at least one character that is not an ASCII letter.
    *
    * @param word
    *   the text to inspect
    * @return
    *   `true` when any character falls outside `a-z` / `A-Z`; `false` otherwise, including for
    *   the empty string
    */
  def isNoisyWord(word: String): Boolean =
    "[^a-zA-Z]".r.findFirstMatchIn(word).isDefined
201

202
  /** Classifies the casing style of a word.
    *
    *   - `UPPERCASE`: every character is an ASCII capital letter
    *   - `CAPITAL`: the first character is an ASCII capital letter and at least one other
    *     character is not
    *   - `LOWERCASE`: the first character is not an ASCII capital letter
    *
    * An empty word carries no casing information and is reported as `LOWERCASE`, so that
    * re-casing a correction with the result is a no-op (previously this case threw a
    * `StringIndexOutOfBoundsException`).
    *
    * @param word
    *   the word to classify
    * @return
    *   one of the `CAPITAL`, `LOWERCASE` or `UPPERCASE` markers
    */
  def getCaseWordType(word: String): Char = {
    // Plain range check; equivalent to matching a single character against "[A-Z]" without
    // compiling a regular expression for every character.
    def isAsciiCapital(letter: Char): Boolean = letter >= 'A' && letter <= 'Z'

    if (word.isEmpty) LOWERCASE
    else if (word.forall(isAsciiCapital)) UPPERCASE
    else if (isAsciiCapital(word.head)) CAPITAL
    else LOWERCASE
  }
221

222
  /** Re-applies a casing style to a word.
    *
    * `CAPITAL` upper-cases only the first character, `UPPERCASE` upper-cases the whole word, and
    * any other marker returns the word unchanged.
    *
    * The first character is replaced by position rather than through `String.replaceFirst`,
    * which would interpret it as a regular expression (and the replacement as a template):
    * words starting with characters such as `(`, `[`, `*`, `$` or `\` used to throw, and `^` or
    * `|` produced a wrong result. An empty word is returned as is.
    *
    * @param caseType
    *   casing marker, one of `CAPITAL`, `LOWERCASE` or `UPPERCASE`
    * @param word
    *   the word to re-case
    * @return
    *   the word in the requested casing style
    */
  def transformToOriginalCaseType(caseType: Char, word: String): String = {
    if (caseType == CAPITAL && word.nonEmpty) {
      word.substring(0, 1).toUpperCase + word.substring(1)
    } else if (caseType == UPPERCASE) {
      word.toUpperCase
    } else {
      word
    }
  }
234

235
  /** Returns the best suggested correction for a potentially misspelled word, if there is one.
    *
    * The word is first passed through `Utilities.limitDuplicates` with the `dupsLimit` setting
    * (presumably to cap runs of repeated characters; confirm against that helper). The lookup
    * then goes through the dictionary-aware path when a dictionary has been set, and straight to
    * the symmetric-delete search otherwise.
    *
    * @param word
    *   the word to correct
    * @return
    *   the best suggestion, or `None` when nothing suitable was found
    */
  def getSuggestedCorrections(word: String): Option[SuggestedWord] = {
    val cleanWord = Utilities.limitDuplicates($(dupsLimit), word)
    get(dictionary) match {
      case Some(_) => getDictionarySuggestions(cleanWord)
      case None => getSymmetricSuggestions(cleanWord)
    }
  }
244

245
  /** Looks a word up in the dictionary before falling back to the symmetric-delete search.
    *
    * The word itself is tried first, then the word with repeated characters removed
    * (`word.distinct`). A hit is returned with frequency `0`, distance `0` and a score derived
    * from the dictionary frequency of the matched entry. When neither form is present the
    * result of `getSymmetricSuggestions` for the original word is returned.
    *
    * @param word
    *   the word to look up
    * @return
    *   the dictionary hit or the symmetric-delete suggestion, if any
    */
  def getDictionarySuggestions(word: String): Option[SuggestedWord] = {
    val knownWords = $$(dictionary)
    val distinctWord = word.distinct

    val dictionaryHit: Option[String] =
      if (knownWords.contains(word)) {
        logger.debug("Word found in dictionary. No spell change")
        Some(word)
      } else if (knownWords.contains(distinctWord)) {
        logger.debug("Word as distinct found in dictionary")
        Some(distinctWord)
      } else {
        None
      }

    dictionaryHit match {
      case Some(entry) =>
        val score = getScoreFrequency(entry)
        getSuggestedWord(Some((entry, (0, 0))), score)
      case None =>
        getSymmetricSuggestions(word)
    }
  }
257

258
  /** Score of a word based on its dictionary frequency.
    *
    * @param word
    *   the word whose frequency is read through `Utilities.getFrequency`
    * @return
    *   that frequency after `normalizeFrequencyValue`
    */
  def getScoreFrequency(word: String): Double =
    normalizeFrequencyValue(Utilities.getFrequency(word, $$(dictionary)))
262

263
  /** Min-max normalises a frequency into the range `[0, 1]`, rounded to four decimals.
    *
    * Values above `maxFrequency` clamp to `1`, values below `minFrequency` clamp to `0`, and a
    * degenerate range (`maxFrequency == minFrequency`) yields `1`. Everything in between is
    * mapped with `(value - minFrequency) / (maxFrequency - minFrequency)`.
    *
    * The numerator previously subtracted `maxFrequency`, which produced results in `[-1, 0]`
    * for in-range values and contradicted the clamps at both ends.
    *
    * @param value
    *   the raw frequency
    * @return
    *   the normalised frequency
    */
  def normalizeFrequencyValue(value: Long): Double = {
    val upperBound = $(maxFrequency)
    val lowerBound = $(minFrequency)

    if (value > upperBound) {
      1.0
    } else if (value < lowerBound) {
      0.0
    } else if (upperBound == lowerBound) {
      // Avoid dividing by zero when the range collapses to a single value.
      1.0
    } else {
      val normalizedValue = (value - lowerBound).toDouble / (upperBound - lowerBound).toDouble
      BigDecimal(normalizedValue).setScale(4, BigDecimal.RoundingMode.HALF_UP).toDouble
    }
  }
277

278
  /** Searches the derived-words index for the best correction of a word.
    *
    * The lower-cased word and its deletes (strings obtained by removing characters, up to
    * `maxEditDistance` of them) are visited breadth-first. Every visited string that is a key of
    * the index can contribute suggestions: itself, when it has a positive frequency, and the
    * words listed for it, when their distance to the input (as computed by
    * `Utilities.levenshteinDistance`) does not exceed `maxEditDistance` and they have a positive
    * frequency. The winner is the candidate with the smallest distance, then the highest
    * frequency, then the lexicographically smallest text.
    *
    * Notes on the implementation:
    *   - When the input itself is a known word with positive frequency, the search stops
    *     immediately. No other candidate can have distance `0`, so the result is the same as
    *     exhausting the queue, just cheaper. The former `breakable { if (...) break }` blocks
    *     never left the loop, since `break` only exits the enclosing `breakable`.
    *   - The former guard on `lowercaseWordLength - queueItemLength > maxEditDistance` is gone:
    *     deletes are only enqueued while that difference is below `maxEditDistance`, so the
    *     condition could never hold.
    *
    * @param word
    *   the word to correct; compared case-insensitively
    * @return
    *   the best suggestion, or `None` when the word is too long for any known word to be within
    *   `maxEditDistance` or when no candidate qualifies
    */
  def getSymmetricSuggestions(word: String): Option[SuggestedWord] = {
    val lowercaseWord = word.toLowerCase()
    val lowercaseWordLength = lowercaseWord.length
    val editLimit = $(maxEditDistance)

    if (lowercaseWordLength - $(longestWordLength) > editLimit) {
      None
    } else {
      var minSuggestLen: Double = Double.PositiveInfinity

      // correction -> (frequency in corpus, edit distance)
      val suggestDict = MMap.empty[String, (Long, Int)]
      // deletes that have already been enqueued, so each one is processed once
      val seenDeletes = scala.collection.mutable.Set.empty[String]
      val pending = scala.collection.mutable.Queue(lowercaseWord)
      var exactMatchFound = false

      while (pending.nonEmpty && !exactMatchFound) {
        val queueItem = pending.dequeue()
        val queueItemLength = queueItem.length
        val deletedChars = lowercaseWordLength - queueItemLength

        if (allWords.contains(queueItem) && !suggestDict.contains(queueItem)) {
          val queueEntry: (List[String], Long) =
            $$(derivedWords).getOrElse(queueItem, (List(""), 0))

          if (queueEntry._2 > 0) {
            // The queue item is itself a word from the corpus: record it with the number of
            // deleted characters as its distance.
            suggestDict(queueItem) = (queueEntry._2, deletedChars)
            if (deletedChars == 0) {
              exactMatchFound = true
            } else if (deletedChars < minSuggestLen) {
              minSuggestLen = deletedChars
            }
          }

          if (!exactMatchFound) {
            // The words stored for this queue item (whether it is a real word or merely a
            // delete) are candidate corrections as well.
            queueEntry._1.foreach { scItem =>
              val lowercaseScItem = scItem.toLowerCase
              if (!suggestDict.contains(lowercaseScItem) && lowercaseScItem != "") {
                val itemDist = Utilities.levenshteinDistance(lowercaseScItem, lowercaseWord)

                if (itemDist <= editLimit) {
                  val candidateEntry: (List[String], Long) =
                    $$(derivedWords).getOrElse(lowercaseScItem, (List(""), 0))
                  if (candidateEntry._2 > 0) {
                    suggestDict(lowercaseScItem) = (candidateEntry._2, itemDist)
                    if (itemDist < minSuggestLen) {
                      minSuggestLen = itemDist
                    }
                  }
                }
                // Depending on processing order, candidates with a larger distance may have
                // been recorded earlier; keep only those at the current minimum.
                suggestDict.retain((_, v) => v._2 <= minSuggestLen)
              }
            }
          }
        }

        // Generate the deletes of the queue item, unless that would exceed the edit limit.
        if (!exactMatchFound && deletedChars < editLimit && queueItemLength > 1) {
          (0 until queueItemLength).foreach { charIndex =>
            val wordMinus =
              queueItem.substring(0, charIndex) + queueItem.substring(charIndex + 1)
            // `add` returns true only the first time a delete is seen
            if (seenDeletes.add(wordMinus)) {
              pending.enqueue(wordMinus)
            }
          }
        }
      }

      val bestSuggestion: Option[(String, (Long, Int))] =
        if (suggestDict.isEmpty) None
        else Some(suggestDict.minBy { case (k, (f, d)) => (d, -f, k) })

      getSuggestedWord(bestSuggestion, -1)
    }
  }
376

377
  /** Wraps a raw `(correction, (frequency, distance))` candidate into a [[SuggestedWord]].
    *
    * @param suggestion
    *   the candidate, if any
    * @param score
    *   the score to report, or the sentinel `-1` to derive it as `distance / maxEditDistance`
    * @return
    *   the suggestion with its score rounded to four decimals, or `None` when there is no
    *   candidate
    */
  private def getSuggestedWord(
      suggestion: Option[(String, (Long, Int))],
      score: Double): Option[SuggestedWord] =
    suggestion.map { case (correction, (frequency, distance)) =>
      val editLimit = $(maxEditDistance)
      val realScore =
        if (score != -1) score
        // With an edit limit of 0 the ratio would be 0/0 = NaN, which BigDecimal rejects with
        // a NumberFormatException; report 0, the value a distance of 0 gets for any other limit.
        else if (editLimit == 0) 0.0
        else distance.toDouble / editLimit.toDouble

      SuggestedWord(
        correction = correction,
        frequency = frequency,
        distance = distance,
        score = BigDecimal(realScore).setScale(4, BigDecimal.RoundingMode.HALF_UP).toDouble)
    }
393

394
}
395

396
/** Reader mix-in that provides the `pretrained` loaders for [[SymmetricDeleteModel]].
  *
  * Every overload only forwards to the inherited implementation; they are restated here with
  * the concrete return type as Java-compliant overrides.
  */
trait ReadablePretrainedSymmetric
    extends ParamsAndFeaturesReadable[SymmetricDeleteModel]
    with HasPretrained[SymmetricDeleteModel] {

  /** Name of the model used when no name is given. */
  override val defaultModelName: Some[String] = Some("spellcheck_sd")

  /** Java compliant-overrides */
  override def pretrained(): SymmetricDeleteModel =
    super.pretrained()

  override def pretrained(name: String): SymmetricDeleteModel =
    super.pretrained(name)

  override def pretrained(name: String, lang: String): SymmetricDeleteModel =
    super.pretrained(name, lang)

  override def pretrained(name: String, lang: String, remoteLoc: String): SymmetricDeleteModel =
    super.pretrained(name, lang, remoteLoc)
}
412

413
/** This is the companion object of [[SymmetricDeleteModel]]. Please refer to that class for the
414
  * documentation.
415
  */
416
object SymmetricDeleteModel extends ReadablePretrainedSymmetric
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc