18652478786

Committed 20 Oct 2025 12:47PM UTC coverage: 55.25% (+0.2%) from 55.094%

Build # 18652478786

Build Type

Pull #14674

github

Committed by

web-flow

Commit Message

Merge b08968fc1 into b827818c7

Pull Request Pull Request #14674: SPARKNLP-1293 Enhancements EntityRuler and DocumentNormalizer

Run Details

114 of 149 new or added lines in 3 files covered. (76.51%)

40 existing lines in 36 files now uncovered.

11919 of 21573 relevant lines covered (55.25%)

0.55 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.55

/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteModel.scala

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators.spell.symmetric

import com.johnsnowlabs.nlp.annotators.spell.util.Utilities
import com.johnsnowlabs.nlp.serialization.MapFeature
import com.johnsnowlabs.nlp._
import org.apache.spark.ml.util.Identifiable
import org.slf4j.LoggerFactory

import scala.collection.immutable.HashSet
import scala.collection.mutable.{Map => MMap}
import scala.util.control.Breaks._

/** Symmetric Delete spelling correction algorithm.
  *
  * The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate
  * generation and dictionary lookup for a given Damerau-Levenshtein distance. It is six orders of
  * magnitude faster (than the standard approach with deletes + transposes + replaces + inserts)
  * and language independent.
  *
  * Inspired by [[https://github.com/wolfgarbe/SymSpell SymSpell]].
  *
  * Pretrained models can be loaded with `pretrained` of the companion object:
  * {{{
  * val spell = SymmetricDeleteModel.pretrained()
  *   .setInputCols("token")
  *   .setOutputCol("spell")
  * }}}
  * The default model is `"spellcheck_sd"`, if no name is provided. For available pretrained
  * models please see the [[https://sparknlp.org/models?task=Spell+Check Models Hub]].
  *
  * See
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteModelTestSpec.scala SymmetricDeleteModelTestSpec]]
  * for further reference.
  *
  * ==Example==
  * {{{
  * import spark.implicits._
  * import com.johnsnowlabs.nlp.base.DocumentAssembler
  * import com.johnsnowlabs.nlp.annotators.Tokenizer
  * import com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel
  * import org.apache.spark.ml.Pipeline
  *
  * val documentAssembler = new DocumentAssembler()
  *   .setInputCol("text")
  *   .setOutputCol("document")
  *
  * val tokenizer = new Tokenizer()
  *   .setInputCols("document")
  *   .setOutputCol("token")
  *
  * val spellChecker = SymmetricDeleteModel.pretrained()
  *   .setInputCols("token")
  *   .setOutputCol("spell")
  *
  * val pipeline = new Pipeline().setStages(Array(
  *   documentAssembler,
  *   tokenizer,
  *   spellChecker
  * ))
  *
  * val data = Seq("spmetimes i wrrite wordz erong.").toDF("text")
  * val result = pipeline.fit(data).transform(data)
  * result.select("spell.result").show(false)
  * +--------------------------------------+
  * |result                                |
  * +--------------------------------------+
  * |[sometimes, i, write, words, wrong, .]|
  * +--------------------------------------+
  * }}}
  *
  * @see
  *   [[com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel NorvigSweetingModel]] for
  *   an alternative approach to spell checking
  * @see
  *   [[com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerModel ContextSpellCheckerModel]]
  *   for a DL based approach
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupname Ungrouped Members
  * @groupprio param  1
  * @groupprio anno  2
  * @groupprio Ungrouped 3
  * @groupprio setParam  4
  * @groupprio getParam  5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
  */
class SymmetricDeleteModel(override val uid: String)
    extends AnnotatorModel[SymmetricDeleteModel]
    with HasSimpleAnnotate[SymmetricDeleteModel]
    with SymmetricDeleteParams {

  import com.johnsnowlabs.nlp.AnnotatorType._

  /** Auxiliary constructor that assigns a uid produced by `Identifiable.randomUID` with the
    * `"SYMSPELL"` prefix.
    */
  def this() = this(Identifiable.randomUID("SYMSPELL"))

  /** Output annotator type: TOKEN
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = TOKEN

  /** Input annotator type: TOKEN
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)

  /** Lookup table used by `getSymmetricSuggestions`. Each key (a word or a delete of a word) maps
    * to the list of candidate corrections stored for it and a frequency; a frequency greater
    * than zero is treated there as "this key is itself a word from the corpus".
    */
  protected val derivedWords: MapFeature[String, (List[String], Long)] =
    new MapFeature(this, "derivedWords")

  /** Optional word-to-frequency dictionary. When it is set, `getSuggestedCorrections` checks it
    * before falling back to the symmetric delete lookup.
    */
  protected val dictionary: MapFeature[String, Long] = new MapFeature(this, "dictionary")

  /** Sets the word-to-frequency dictionary.
    *
    * @group setParam
    */
  def setDictionary(value: Map[String, Long]): this.type = set(dictionary, value)

  /** Sets the derived-words lookup table (key -> (candidate corrections, frequency)).
    *
    * @group setParam
    */
  def setDerivedWords(value: Map[String, (List[String], Long)]): this.type =
    set(derivedWords, value)

  // NOTE(review): the logger is registered under the approach's name, not the model's —
  // confirm this is intentional before renaming.
  private val logger = LoggerFactory.getLogger("SymmetricDeleteApproach")

  // Lower-cased key set of `derivedWords`, built on first use so that it reflects the feature
  // value available at that time.
  private lazy val allWords: HashSet[String] = {
    HashSet($$(derivedWords).keys.toSeq.map(_.toLowerCase): _*)
  }

  // Case-type markers returned by `getCaseWordType` and consumed by
  // `transformToOriginalCaseType`.
  private val CAPITAL = 'C'
  private val LOWERCASE = 'L'
  private val UPPERCASE = 'U'

  /** A correction candidate.
    *
    * @param correction
    *   the suggested replacement word
    * @param frequency
    *   frequency stored for the suggestion
    * @param distance
    *   edit distance between the input word and the suggestion
    * @param score
    *   confidence value reported in the annotation metadata
    */
  case class SuggestedWord(correction: String, frequency: Long, distance: Int, score: Double)

  /** Spell-checks every incoming token.
    *
    * Each input annotation yields exactly one output annotation that keeps the original
    * `begin`/`end` offsets, carries the (possibly corrected) word as its result and stores the
    * score returned by `checkSpellWord` under the `"confidence"` metadata key.
    *
    * @param annotations
    *   token annotations to check
    * @return
    *   one annotation of type `outputAnnotatorType` per input annotation, in the same order
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
    for (token <- annotations) yield {
      val (checkedWord, confidence) = checkSpellWord(token.result)
      Annotation(
        outputAnnotatorType,
        token.begin,
        token.end,
        checkedWord,
        Map("confidence" -> confidence.toString))
    }

  /** Checks a single word and returns the word to emit together with its score.
    *
    * Words flagged by `isNoisyWord` are returned untouched with score 0. Otherwise the best
    * suggestion (if any) is re-cased to match the casing of the input word and returned with the
    * suggestion's score; when there is no suggestion the input word is returned with score 0.
    *
    * @param originalWord
    *   the word as it appears in the text
    * @return
    *   `(word to emit, score)`
    */
  def checkSpellWord(originalWord: String): (String, Double) = {
    logger.debug(s"spell checker target word: $originalWord")
    if (isNoisyWord(originalWord)) {
      (originalWord, 0d)
    } else {
      val originalCaseType = getCaseWordType(originalWord)
      val suggestedWord = getSuggestedCorrections(originalWord)
      suggestedWord match {
        case Some(best) =>
          logger.debug(
            s"Received: $originalWord. Best correction is: $suggestedWord. " +
              s"Because frequency was ${best.frequency} " +
              s"and edit distance was ${best.distance}")
          (transformToOriginalCaseType(originalCaseType, best.correction), best.score)
        case None =>
          (originalWord, 0d)
      }
    }
  }

  /** Tells whether a word contains anything other than the ASCII letters `a-z` / `A-Z`.
    *
    * @param word
    *   the word to inspect
    * @return
    *   `true` if at least one character is not an ASCII letter, `false` otherwise (including for
    *   the empty string)
    */
  def isNoisyWord(word: String): Boolean =
    "[^a-zA-Z]".r.findFirstIn(word).isDefined

  /** Classifies the casing of a word.
    *
    *   - `UPPERCASE` when every character is an ASCII capital (`A-Z`),
    *   - `CAPITAL` when the first character is an ASCII capital but some other character is not,
    *   - `LOWERCASE` otherwise.
    *
    * An empty word is classified as `LOWERCASE` (so no re-casing is applied later) instead of
    * failing on the first-character lookup.
    *
    * @param word
    *   the word to classify
    * @return
    *   one of the `CAPITAL`, `LOWERCASE` or `UPPERCASE` markers
    */
  def getCaseWordType(word: String): Char = {
    // Same character class as the "[A-Z]" pattern, without compiling a regex per character.
    def isAsciiCapital(letter: Char): Boolean = letter >= 'A' && letter <= 'Z'

    if (word.isEmpty) LOWERCASE
    else if (word.forall(isAsciiCapital)) UPPERCASE
    else if (isAsciiCapital(word.head)) CAPITAL
    else LOWERCASE
  }

  /** Re-applies a casing type (as produced by `getCaseWordType`) to a word.
    *
    *   - `CAPITAL`: upper-cases the first character and leaves the rest unchanged,
    *   - `UPPERCASE`: upper-cases the whole word,
    *   - anything else: returns the word unchanged.
    *
    * The first character is replaced positionally rather than through `replaceFirst`, which
    * would interpret it as a regular expression (and the replacement as a template) and fail or
    * misbehave for words starting with characters such as `(`, `*` or `$`. An empty word is
    * returned as is.
    *
    * @param caseType
    *   casing marker to apply
    * @param word
    *   the word to transform
    * @return
    *   the re-cased word
    */
  def transformToOriginalCaseType(caseType: Char, word: String): String = {
    if (caseType == CAPITAL) {
      if (word.isEmpty) word
      else word.substring(0, 1).toUpperCase + word.substring(1)
    } else if (caseType == UPPERCASE) {
      word.toUpperCase
    } else {
      word
    }
  }

  /** Returns the best correction for a potentially misspelled word.
    *
    * Runs of repeated characters are first limited through `Utilities.limitDuplicates` using the
    * `dupsLimit` parameter. The cleaned word is then looked up through the dictionary path when a
    * dictionary has been set, and through the symmetric delete path otherwise.
    *
    * @param word
    *   the word to correct
    * @return
    *   the best suggestion, or `None` when nothing suitable was found
    */
  def getSuggestedCorrections(word: String): Option[SuggestedWord] = {
    val cleanWord = Utilities.limitDuplicates($(dupsLimit), word)
    get(dictionary) match {
      case Some(_) => getDictionarySuggestions(cleanWord)
      case None => getSymmetricSuggestions(cleanWord)
    }
  }

  /** Looks a word up in the dictionary before trying the symmetric delete lookup.
    *
    * The word itself is tried first, then the word with repeated characters removed
    * (`word.distinct`). A hit is returned as a suggestion with frequency 0 and distance 0, scored
    * by `getScoreFrequency`. If neither form is in the dictionary the call is delegated to
    * `getSymmetricSuggestions`.
    *
    * @param word
    *   the word to look up
    * @return
    *   the dictionary hit or the symmetric delete suggestion, if any
    */
  def getDictionarySuggestions(word: String): Option[SuggestedWord] = {
    // Wraps a dictionary entry as a zero-distance suggestion.
    def dictionaryHit(entry: String): Option[SuggestedWord] = {
      val entryScore = getScoreFrequency(entry)
      getSuggestedWord(Some((entry, (0L, 0))), entryScore)
    }

    if ($$(dictionary).contains(word)) {
      logger.debug("Word found in dictionary. No spell change")
      dictionaryHit(word)
    } else {
      val withoutRepeats = word.distinct
      if ($$(dictionary).contains(withoutRepeats)) {
        logger.debug("Word as distinct found in dictionary")
        dictionaryHit(withoutRepeats)
      } else {
        getSymmetricSuggestions(word)
      }
    }
  }

  /** Scores a word by its dictionary frequency.
    *
    * @param word
    *   the word whose frequency is read through `Utilities.getFrequency`
    * @return
    *   the frequency after `normalizeFrequencyValue`
    */
  def getScoreFrequency(word: String): Double =
    normalizeFrequencyValue(Utilities.getFrequency(word, $$(dictionary)))

  /** Min-max normalises a frequency into `[0, 1]` using the `minFrequency` and `maxFrequency`
    * parameters.
    *
    *   - values above `maxFrequency` map to 1,
    *   - values below `minFrequency` map to 0,
    *   - when both bounds are equal the result is 1,
    *   - otherwise the result is `(value - minFrequency) / (maxFrequency - minFrequency)`,
    *     rounded half-up to four decimals.
    *
    * The offset is taken from `minFrequency`; subtracting `maxFrequency` instead would yield
    * values in `[-1, 0]`, contradicting the clamps above and colliding with the `-1` "no score"
    * marker that `getSuggestedWord` checks for.
    *
    * @param value
    *   raw frequency
    * @return
    *   normalised frequency between 0 and 1
    */
  def normalizeFrequencyValue(value: Long): Double = {
    if (value > $(maxFrequency)) {
      1d
    } else if (value < $(minFrequency)) {
      0d
    } else if ($(maxFrequency) == $(minFrequency)) {
      // Degenerate range: avoid dividing by zero.
      1d
    } else {
      val normalizedValue =
        (value - $(minFrequency)).toDouble / ($(maxFrequency) - $(minFrequency)).toDouble
      BigDecimal(normalizedValue).setScale(4, BigDecimal.RoundingMode.HALF_UP).toDouble
    }
  }

  /** Finds the best correction for a word with the symmetric delete lookup.
    *
    * Starting from the lower-cased word, the method walks a FIFO queue of progressively shorter
    * "deletes" (the word with one character removed, then two, ...) up to `maxEditDistance`
    * removals. Every queue item that is a key of `derivedWords` contributes
    *   - itself, when its stored frequency is positive, with distance = number of removed
    *     characters, and
    *   - each of its stored candidate corrections whose `Utilities.levenshteinDistance` to the
    *     input is at most `maxEditDistance` and whose own stored frequency is positive.
    *
    * Only suggestions at the smallest distance seen so far are kept. The winner is the suggestion
    * with the smallest distance, then the highest frequency, then the lexicographically smallest
    * word.
    *
    * The two early exits terminate the whole search: the `breakable` scope encloses the loop, so
    * an exact match (distance 0) is returned immediately instead of continuing to expand the
    * queue. An exact match is always the unique distance-0 entry and therefore always the
    * winner, so stopping early does not change the result.
    *
    * @param word
    *   the word to correct
    * @return
    *   the best suggestion, or `None` if the word is too long or nothing was found
    */
  def getSymmetricSuggestions(word: String): Option[SuggestedWord] = {
    val lowercaseWord = word.toLowerCase()
    val lowercaseWordLength = lowercaseWord.length

    if (lowercaseWordLength - $(longestWordLength) > $(maxEditDistance)) {
      // More than maxEditDistance characters longer than longestWordLength (presumably the
      // longest known word — confirm against the approach): nothing can match.
      None
    } else {
      // Loop-invariant lookups, read once.
      val editLimit = $(maxEditDistance)
      val derived = $$(derivedWords)

      var minSuggestLen: Double = Double.PositiveInfinity

      // suggestion -> (frequency, edit distance)
      val suggestDict = MMap.empty[String, (Long, Int)]
      // deletes that were already enqueued; only the keys matter
      val queueDictionary = MMap.empty[String, String]
      var queueList = Iterator(lowercaseWord)

      breakable {
        while (queueList.hasNext) {
          val queueItem = queueList.next() // pop
          val queueItemLength = queueItem.length
          val removedChars = lowercaseWordLength - queueItemLength

          // early exit: suggestions exist and this item is already beyond the edit limit
          if (suggestDict.nonEmpty && removedChars > editLimit) break()

          // process queue item
          if (allWords.contains(queueItem) && !suggestDict.contains(queueItem)) {
            val (storedCorrections, queueItemFrequency) =
              derived.getOrElse(queueItem, (List(""), 0L))

            if (queueItemFrequency > 0) {
              // The queue item is itself a corpus word and not yet suggested. Queue items other
              // than the input are shorter than it because only deletes are enqueued, so the
              // number of removed characters is its edit distance.
              suggestDict(queueItem) = (queueItemFrequency, removedChars)

              // early exit: the input itself is a known word
              if (removedChars == 0) break()

              if (removedChars < minSuggestLen) {
                minSuggestLen = removedChars
              }
            }

            // The corrections stored for the queue item (whether it is a real word or merely a
            // delete) can be valid corrections for the input.
            storedCorrections.foreach { scItem =>
              val lowercaseScItem = scItem.toLowerCase
              if (!suggestDict.contains(lowercaseScItem) && lowercaseScItem != "") {
                val itemDist = Utilities.levenshteinDistance(lowercaseScItem, lowercaseWord)

                if (itemDist <= editLimit) {
                  val (_, candidateFrequency) =
                    derived.getOrElse(lowercaseScItem, (List(""), 0L))
                  if (candidateFrequency > 0) {
                    suggestDict(lowercaseScItem) = (candidateFrequency, itemDist)
                    if (itemDist < minSuggestLen) {
                      minSuggestLen = itemDist
                    }
                  }
                }
                // Depending on processing order, suggestions at different distances may have
                // been entered; keep only those at the current minimum.
                suggestDict.retain((_, v) => v._2 <= minSuggestLen)
              }
            }
          }

          // Generate the deletes of the queue item, unless that would exceed the edit limit.
          if (removedChars < editLimit && queueItemLength > 1) {
            for (c <- 0 until queueItemLength) { // character index to drop
              val wordMinus =
                queueItem.substring(0, c).concat(queueItem.substring(c + 1, queueItemLength))
              if (!queueDictionary.contains(wordMinus)) {
                queueList ++= Iterator(wordMinus)
                queueDictionary(wordMinus) =
                  "None" // arbitrary value, just to identify we checked this
              }
            }
          }
        }
      }

      // Best suggestion: smallest distance, then highest frequency, then alphabetical order.
      val bestSuggestion =
        suggestDict.toSeq.sortBy { case (k, (f, d)) => (d, -f, k) }.headOption
      getSuggestedWord(bestSuggestion, -1)
    }
  }

  /** Converts a raw `(word, (frequency, distance))` suggestion into a [[SuggestedWord]].
    *
    * A `score` of `-1` means "no score supplied"; in that case the score is computed as
    * `distance / maxEditDistance`. The final score is rounded half-up to four decimals.
    *
    * @param suggestion
    *   the raw suggestion, if any
    * @param score
    *   precomputed score, or `-1` to derive it from the edit distance
    * @return
    *   the wrapped suggestion, or `None` when `suggestion` is empty
    */
  private def getSuggestedWord(
      suggestion: Option[(String, (Long, Int))],
      score: Double): Option[SuggestedWord] =
    suggestion.map { case (candidate, (corpusFrequency, editDistance)) =>
      val realScore =
        if (score == -1) editDistance.toDouble / $(maxEditDistance).toDouble
        else score
      SuggestedWord(
        correction = candidate,
        frequency = corpusFrequency,
        distance = editDistance,
        score = BigDecimal(realScore).setScale(4, BigDecimal.RoundingMode.HALF_UP).toDouble)
    }

}

/** Reader mix-in for [[SymmetricDeleteModel]] that combines `ParamsAndFeaturesReadable` with
  * `HasPretrained` and re-declares every `pretrained` overload with the concrete return type.
  */
trait ReadablePretrainedSymmetric
    extends ParamsAndFeaturesReadable[SymmetricDeleteModel]
    with HasPretrained[SymmetricDeleteModel] {
  /** Name of the model used when no name is provided: `"spellcheck_sd"`. */
  override val defaultModelName = Some("spellcheck_sd")

  /** Java compliant-overrides */
  // Each overload below only forwards its arguments to the inherited implementation.
  override def pretrained(): SymmetricDeleteModel = super.pretrained()

  /** Forwards `name` to the inherited `pretrained(name)`. */
  override def pretrained(name: String): SymmetricDeleteModel = super.pretrained(name)

  /** Forwards `name` and `lang` to the inherited `pretrained(name, lang)`. */
  override def pretrained(name: String, lang: String): SymmetricDeleteModel =
    super.pretrained(name, lang)

  /** Forwards `name`, `lang` and `remoteLoc` to the inherited
    * `pretrained(name, lang, remoteLoc)`.
    */
  override def pretrained(name: String, lang: String, remoteLoc: String): SymmetricDeleteModel =
    super.pretrained(name, lang, remoteLoc)
}

/** This is the companion object of [[SymmetricDeleteModel]]. Please refer to that class for the
  * documentation.
  *
  * It exposes the `pretrained` overloads declared in [[ReadablePretrainedSymmetric]], e.g.
  * `SymmetricDeleteModel.pretrained()`.
  */
object SymmetricDeleteModel extends ReadablePretrainedSymmetric

1	/*
2	* Copyright 2017-2022 John Snow Labs
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*/
16
17	package com.johnsnowlabs.nlp.annotators.spell.symmetric
18
19	import com.johnsnowlabs.nlp.annotators.spell.util.Utilities
20	import com.johnsnowlabs.nlp.serialization.MapFeature
21	import com.johnsnowlabs.nlp._
22	import org.apache.spark.ml.util.Identifiable
23	import org.slf4j.LoggerFactory
24
25	import scala.collection.immutable.HashSet
26	import scala.collection.mutable.{Map => MMap}
27	import scala.util.control.Breaks._
28
29	/** Symmetric Delete spelling correction algorithm.
30	*
31	* The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate
32	* generation and dictionary lookup for a given Damerau-Levenshtein distance. It is six orders of
33	* magnitude faster (than the standard approach with deletes + transposes + replaces + inserts)
34	* and language independent.
35	*
36	* Inspired by [[https://github.com/wolfgarbe/SymSpell SymSpell]].
37	*
38	* Pretrained models can be loaded with `pretrained` of the companion object:
39	* {{{
40	* val spell = SymmetricDeleteModel.pretrained()
41	* .setInputCols("token")
42	* .setOutputCol("spell")
43	* }}}
44	* The default model is `"spellcheck_sd"`, if no name is provided. For available pretrained
45	* models please see the [[https://sparknlp.org/models?task=Spell+Check Models Hub]].
46	*
47	* See
48	* [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteModelTestSpec.scala SymmetricDeleteModelTestSpec]]
49	* for further reference.
50	*
51	* ==Example==
52	* {{{
53	* import spark.implicits._
54	* import com.johnsnowlabs.nlp.base.DocumentAssembler
55	* import com.johnsnowlabs.nlp.annotators.Tokenizer
56	* import com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel
57	* import org.apache.spark.ml.Pipeline
58	*
59	* val documentAssembler = new DocumentAssembler()
60	* .setInputCol("text")
61	* .setOutputCol("document")
62	*
63	* val tokenizer = new Tokenizer()
64	* .setInputCols("document")
65	* .setOutputCol("token")
66	*
67	* val spellChecker = SymmetricDeleteModel.pretrained()
68	* .setInputCols("token")
69	* .setOutputCol("spell")
70	*
71	* val pipeline = new Pipeline().setStages(Array(
72	* documentAssembler,
73	* tokenizer,
74	* spellChecker
75	* ))
76	*
77	* val data = Seq("spmetimes i wrrite wordz erong.").toDF("text")
78	* val result = pipeline.fit(data).transform(data)
79	* result.select("spell.result").show(false)
80	* +--------------------------------------+
81	* \|result \|
82	* +--------------------------------------+
83	* \|[sometimes, i, write, words, wrong, .]\|
84	* +--------------------------------------+
85	* }}}
86	*
87	* @see
88	* [[com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel NorvigSweetingModel]] for
89	* an alternative approach to spell checking
90	* @see
91	* [[com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerModel ContextSpellCheckerModel]]
92	* for a DL based approach
93	* @groupname anno Annotator types
94	* @groupdesc anno
95	* Required input and expected output annotator types
96	* @groupname Ungrouped Members
97	* @groupname param Parameters
98	* @groupname setParam Parameter setters
99	* @groupname getParam Parameter getters
100	* @groupname Ungrouped Members
101	* @groupprio param 1
102	* @groupprio anno 2
103	* @groupprio Ungrouped 3
104	* @groupprio setParam 4
105	* @groupprio getParam 5
106	* @groupdesc param
107	* A list of (hyper-)parameter keys this annotator can take. Users can set and get the
108	* parameter values through setters and getters, respectively.
109	*/
110	class SymmetricDeleteModel(override val uid: String)
111	extends AnnotatorModel[SymmetricDeleteModel]
112	with HasSimpleAnnotate[SymmetricDeleteModel]
113	with SymmetricDeleteParams {
114
115	import com.johnsnowlabs.nlp.AnnotatorType._
116
117	def this() = this(Identifiable.randomUID("SYMSPELL"))	1✔
118
119	/** Output annotator type: TOKEN
120	*
121	* @group anno
122	*/
123	override val outputAnnotatorType: AnnotatorType = TOKEN	1✔
124
125	/** Input annotator type: TOKEN
126	*
127	* @group anno
128	*/
129	override val inputAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)	1✔
130
131	protected val derivedWords: MapFeature[String, (List[String], Long)] =
132	new MapFeature(this, "derivedWords")	1✔
133
134	protected val dictionary: MapFeature[String, Long] = new MapFeature(this, "dictionary")	1✔
135
136	/** @group setParam */
137	def setDictionary(value: Map[String, Long]): this.type = set(dictionary, value)	1✔
138
139	/** @group setParam */
140	def setDerivedWords(value: Map[String, (List[String], Long)]): this.type =
141	set(derivedWords, value)	1✔
142
143	private val logger = LoggerFactory.getLogger("SymmetricDeleteApproach")	1✔
144
145	private lazy val allWords: HashSet[String] = {
146	HashSet($$(derivedWords).keys.toSeq.map(_.toLowerCase): _*)
147	}
148
149	private val CAPITAL = 'C'	1✔
150	private val LOWERCASE = 'L'	1✔
151	private val UPPERCASE = 'U'	1✔
152
153	case class SuggestedWord(correction: String, frequency: Long, distance: Int, score: Double)
154
155	override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
156	annotations.map { token =>	1✔
157	{
158	val verifiedWord = checkSpellWord(token.result)	1✔
159	Annotation(	1✔
160	outputAnnotatorType,	1✔
161	token.begin,	1✔
162	token.end,	1✔
163	verifiedWord._1,	1✔
164	Map("confidence" -> verifiedWord._2.toString))	1✔
165	}
166	}
167	}
168
169	def checkSpellWord(originalWord: String): (String, Double) = {
170	logger.debug(s"spell checker target word: $originalWord")	1✔
171	var score: Double = 0	1✔
172	if (isNoisyWord(originalWord)) {	1✔
173	return (originalWord, score)	1✔
174	}
175	var transformedWord = originalWord
176	val originalCaseType = getCaseWordType(originalWord)	1✔
177	val suggestedWord = getSuggestedCorrections(originalWord)	1✔
178	if (suggestedWord.isDefined) {	1✔
179	logger.debug(	1✔
180	s"Received: $originalWord. Best correction is: $suggestedWord. " +	1✔
181	s"Because frequency was ${suggestedWord.get.frequency} " +	1✔
182	s"and edit distance was ${suggestedWord.get.distance}")	1✔
183	transformedWord =
184	transformToOriginalCaseType(originalCaseType, suggestedWord.get.correction)	1✔
185	score = suggestedWord.get.score	1✔
186	}
187
188	(transformedWord, score)	1✔
189	}
190
191	def isNoisyWord(word: String): Boolean = {
192	val noisyWordRegex = "[^a-zA-Z]".r	1✔
193	val matchNoisyWord = noisyWordRegex.findFirstMatchIn(word)	1✔
194
195	if (matchNoisyWord.isEmpty) {	1✔
196	false	1✔
197	} else {
198	true	1✔
199	}
200	}
201
202	def getCaseWordType(word: String): Char = {
203	val firstLetter = word(0).toString	1✔
204	val matchUpperCaseFirstLetter = "[A-Z]".r.findFirstMatchIn(firstLetter)	1✔
205
206	var caseType = UPPERCASE	1✔
207
208	word.foreach { letter =>	1✔
209	val matchUpperCase = "[A-Z]".r.findFirstMatchIn(letter.toString)	1✔
210	if (matchUpperCase.isEmpty) {	1✔
211	if (matchUpperCaseFirstLetter.nonEmpty) {	1✔
212	caseType = CAPITAL	1✔
213	} else {
214	caseType = LOWERCASE	1✔
215	}
216	}
217	}
218
219	caseType
220	}
221
222	def transformToOriginalCaseType(caseType: Char, word: String): String = {
223
224	var transformedWord = word
225
226	if (caseType == CAPITAL) {	1✔
227	val firstLetter = word(0).toString	1✔
228	transformedWord = word.replaceFirst(firstLetter, firstLetter.toUpperCase)	1✔
229	} else if (caseType == UPPERCASE) {	1✔
230	transformedWord = word.toUpperCase	1✔
231	}
232	transformedWord
233	}
234
235	/** Return list of suggested corrections for potentially incorrectly spelled word */
236	def getSuggestedCorrections(word: String): Option[SuggestedWord] = {
237	val cleanWord = Utilities.limitDuplicates($(dupsLimit), word)	1✔
238	if (get(dictionary).isDefined) {	1✔
239	getDictionarySuggestions(cleanWord)	1✔
240	} else {
241	getSymmetricSuggestions(cleanWord)	1✔
242	}
243	}
244
245	def getDictionarySuggestions(word: String): Option[SuggestedWord] = {
246	if ($$(dictionary).contains(word)) {	1✔
247	logger.debug("Word found in dictionary. No spell change")	1✔
248	val score = getScoreFrequency(word)	1✔
249	getSuggestedWord(Some((word, (0, 0))), score)	1✔
250	} else if ($$(dictionary).contains(word.distinct)) {	1✔
251	logger.debug("Word as distinct found in dictionary")	1✔
252	val score = getScoreFrequency(word.distinct)	1✔
253	getSuggestedWord(Some((word.distinct, (0, 0))), score)	1✔
254	} else
255	getSymmetricSuggestions(word)	1✔
256	}
257
258	def getScoreFrequency(word: String): Double = {
259	val frequency = Utilities.getFrequency(word, $$(dictionary))	1✔
260	normalizeFrequencyValue(frequency)	1✔
261	}
262
263	def normalizeFrequencyValue(value: Long): Double = {
264	if (value > $(maxFrequency)) {	1✔
265	return 1	×
266	}
267	if (value < $(minFrequency)) {	1✔
268	return 0	×
269	}
UNCOV 270	if ($(maxFrequency) == $(minFrequency)) {	×
271	return 1	1✔
272	}
273	val normalizedValue =
274	(value - $(maxFrequency)).toDouble / ($(maxFrequency) - $(minFrequency)).toDouble	×
275	BigDecimal(normalizedValue).setScale(4, BigDecimal.RoundingMode.HALF_UP).toDouble	×
276	}
277
278	def getSymmetricSuggestions(word: String): Option[SuggestedWord] = {
279	val lowercaseWord = word.toLowerCase()	1✔
280	val lowercaseWordLength = lowercaseWord.length	1✔
281	if (lowercaseWordLength - $(longestWordLength) > $(maxEditDistance))	1✔
282	return None	×
283
284	var minSuggestLen: Double = Double.PositiveInfinity	1✔
285
286	val suggestDict = MMap.empty[String, (Long, Int)]	1✔
287	val queueDictionary = MMap.empty[String, String] // items other than string that we've checked	1✔
288	var queueList = Iterator(lowercaseWord)	1✔
289
290	while (queueList.hasNext) {	1✔
291	val queueItem = queueList.next // pop	1✔
292	val queueItemLength = queueItem.length	1✔
293
294	breakable { // early exit	1✔
295	if (suggestDict.nonEmpty && (lowercaseWordLength - queueItemLength) > $(	1✔
296	maxEditDistance)) {	1✔
297	break	×
298	}
299	}
300
301	// process queue item
302	if (allWords.contains(queueItem) && !suggestDict.contains(queueItem)) {	1✔
303
304	var suggestedWordsWeight: (List[String], Long) =
305	$$(derivedWords).getOrElse(queueItem, (List(""), 0))	1✔
306
307	if (suggestedWordsWeight._2 > 0) {	1✔
308	// word is in dictionary, and is a word from the corpus, and not already in suggestion list
309	// so add to suggestion dictionary, indexed by the word with value:
310	// (frequency in corpus, edit distance)
311	// note q_items that are not the input string are shorter than input string since only
312	// deletes are added (unless manual dictionary corrections are added)
313	suggestDict(queueItem) =	1✔
314	(suggestedWordsWeight._2, lowercaseWordLength - queueItemLength)	1✔
315
316	breakable { // early exit	1✔
317	if (lowercaseWordLength == queueItemLength) {	1✔
318	break	×
319	}
320	}
321
322	if (lowercaseWordLength - queueItemLength < minSuggestLen) {	1✔
323	minSuggestLen = lowercaseWordLength - queueItemLength	×
324	}
325	}
326
327	// the suggested corrections for q_item as stored in dictionary (whether or not queueItem itself
328	// is a valid word or merely a delete) can be valid corrections
329	suggestedWordsWeight._1.foreach(scItem => {	1✔
330	val lowercaseScItem = scItem.toLowerCase	1✔
331	if (!suggestDict.contains(lowercaseScItem) && lowercaseScItem != "") {	1✔
332
333	// calculate edit distance using Damerau-Levenshtein distance
334	val itemDist = Utilities.levenshteinDistance(lowercaseScItem, lowercaseWord)	1✔
335
336	if (itemDist <= $(maxEditDistance)) {	1✔
337	suggestedWordsWeight = $$(derivedWords).getOrElse(lowercaseScItem, (List(""), 0))	1✔
338	if (suggestedWordsWeight._2 > 0) {	1✔
339	suggestDict(lowercaseScItem) = (suggestedWordsWeight._2, itemDist)	1✔
340	if (itemDist < minSuggestLen) {	1✔
341	minSuggestLen = itemDist	1✔
342	}
343	}
344	}
345	// depending on order words are processed, some words with different edit distances may be
346	// entered into suggestions; trim suggestion dictionary
347	suggestDict.retain((_, v) => v._2 <= minSuggestLen)	1✔
348	}
349	})
350
351	}
352
353	// now generate deletes (e.g. a substring of string or of a delete) from the queue item
354	// do not add words with greater edit distance
355	if ((lowercaseWordLength - queueItemLength) < $(maxEditDistance) && queueItemLength > 1) {	1✔
356	val y = 0 until queueItemLength	1✔
357	y.foreach(c => { // character index	1✔
358	// result of word minus c
359	val wordMinus =
360	queueItem.substring(0, c).concat(queueItem.substring(c + 1, queueItemLength))	1✔
361	if (!queueDictionary.contains(wordMinus)) {	1✔
362	queueList ++= Iterator(wordMinus)	1✔
363	queueDictionary(wordMinus) =	1✔
364	"None" // arbitrary value, just to identify we checked this
365	}
366	}) // End queueItem.foreach
367	}
368
369	} // End while
370
371	// return list of suggestions with (correction, (frequency in corpus, edit distance))
372
373	val suggestions = suggestDict.toSeq.sortBy { case (k, (f, d)) => (d, -f, k) }.toList	1✔
374	getSuggestedWord(suggestions.headOption.orElse(None), -1)	1✔
375	}
376
377	private def getSuggestedWord(
378	suggestion: Option[(String, (Long, Int))],
379	score: Double): Option[SuggestedWord] = {
380	if (suggestion.isDefined) {	1✔
381	val realScore =
382	if (score == -1) suggestion.get._2._2.toDouble / $(maxEditDistance).toDouble else score	1✔
383	Some(	1✔
384	SuggestedWord(	1✔
385	correction = suggestion.get._1,	1✔
386	frequency = suggestion.get._2._1,	1✔
387	distance = suggestion.get._2._2,	1✔
388	score = BigDecimal(realScore).setScale(4, BigDecimal.RoundingMode.HALF_UP).toDouble))	1✔
389	} else {
390	None	1✔
391	}
392	}
393
394	}
395
396	trait ReadablePretrainedSymmetric
397	extends ParamsAndFeaturesReadable[SymmetricDeleteModel]
398	with HasPretrained[SymmetricDeleteModel] {
399	override val defaultModelName = Some("spellcheck_sd")	×
400
401	/** Java compliant-overrides */
402	override def pretrained(): SymmetricDeleteModel = super.pretrained()	×
403
404	override def pretrained(name: String): SymmetricDeleteModel = super.pretrained(name)	×
405
406	override def pretrained(name: String, lang: String): SymmetricDeleteModel =
407	super.pretrained(name, lang)	×
408
409	override def pretrained(name: String, lang: String, remoteLoc: String): SymmetricDeleteModel =
410	super.pretrained(name, lang, remoteLoc)	×
411	}
412
413	/** This is the companion object of [[SymmetricDeleteModel]]. Please refer to that class for the
414	* documentation.
415	*/
416	object SymmetricDeleteModel extends ReadablePretrainedSymmetric

JohnSnowLabs / spark-nlp / 18652478786

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous