JohnSnowLabs / spark-nlp / 13883000244

16 Mar 2025 11:44AM UTC coverage: 59.034% (-1.0%) from 60.072%
Pull Request #14444: Sparknlp 1060 implement phi 3.5 vision
Merge 6d717703b into 05000ab4a

0 of 292 new or added lines in 5 files covered. (0.0%)

20 existing lines in 14 files now uncovered.

9413 of 15945 relevant lines covered (59.03%)

0.59 hits per line

Source File (90.32% covered):
/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteApproach.scala
/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators.spell.symmetric

import com.johnsnowlabs.nlp.annotators.param.ExternalResourceParam
import com.johnsnowlabs.nlp.util.io.ResourceHelper.spark.implicits._
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import com.johnsnowlabs.nlp.{Annotation, AnnotatorApproach}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{AnalysisException, Dataset}

import scala.collection.mutable.ListBuffer

/** Trains a Symmetric Delete spelling correction algorithm. Retrieves tokens and utilizes
  * distance metrics to compute possible derived words.
  *
  * The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate
  * generation and dictionary lookup for a given Damerau-Levenshtein distance. It is six orders of
  * magnitude faster (than the standard approach with deletes + transposes + replaces + inserts)
  * and language independent. A dictionary of correct spellings must be provided with
  * `setDictionary` either in the form of a text file or directly as an
  * [[com.johnsnowlabs.nlp.util.io.ExternalResource ExternalResource]], where each word is parsed
  * by a regex pattern.
  *
  * Inspired by [[https://github.com/wolfgarbe/SymSpell SymSpell]].
  *
  * For instantiated/pretrained models, see [[SymmetricDeleteModel]].
  *
  * See
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteModelTestSpec.scala SymmetricDeleteModelTestSpec]]
  * for further reference.
  *
  * ==Example==
  * In this example, the dictionary `"words.txt"` has the form of
  * {{{
  * ...
  * gummy
  * gummic
  * gummier
  * gummiest
  * gummiferous
  * ...
  * }}}
  * This dictionary is then set to be the basis of the spell checker.
  * {{{
  * import com.johnsnowlabs.nlp.base.DocumentAssembler
  * import com.johnsnowlabs.nlp.annotators.Tokenizer
  * import com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteApproach
  * import org.apache.spark.ml.Pipeline
  *
  * val documentAssembler = new DocumentAssembler()
  *   .setInputCol("text")
  *   .setOutputCol("document")
  *
  * val tokenizer = new Tokenizer()
  *   .setInputCols("document")
  *   .setOutputCol("token")
  *
  * val spellChecker = new SymmetricDeleteApproach()
  *   .setInputCols("token")
  *   .setOutputCol("spell")
  *   .setDictionary("src/test/resources/spell/words.txt")
  *
  * val pipeline = new Pipeline().setStages(Array(
  *   documentAssembler,
  *   tokenizer,
  *   spellChecker
  * ))
  *
  * val pipelineModel = pipeline.fit(trainingData)
  * }}}
  *
  * @see
  *   [[com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach NorvigSweetingApproach]]
  *   for an alternative approach to spell checking
  * @see
  *   [[com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerApproach ContextSpellCheckerApproach]]
  *   for a DL based approach
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupprio param  1
  * @groupprio anno  2
  * @groupprio Ungrouped 3
  * @groupprio setParam  4
  * @groupprio getParam  5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
  */
class SymmetricDeleteApproach(override val uid: String)
    extends AnnotatorApproach[SymmetricDeleteModel]
    with SymmetricDeleteParams {

  import com.johnsnowlabs.nlp.AnnotatorType._

  /** Spell checking algorithm inspired by the Symmetric Delete algorithm */
  override val description: String =
    "Spell checking algorithm inspired by the Symmetric Delete algorithm"

  /** Optional dictionary of properly written words. If provided, significantly boosts spell
    * checking performance.
    *
    * Needs `"tokenPattern"` (Default: `\S+`) for parsing the resource.
    * ==Example==
    * {{{
    * ...
    * gummy
    * gummic
    * gummier
    * gummiest
    * gummiferous
    * ...
    * }}}
    *
    * @group param
    */
  val dictionary =
    new ExternalResourceParam(this, "dictionary", "file with a list of correct words")

  setDefault(frequencyThreshold -> 0, deletesThreshold -> 0, maxEditDistance -> 3, dupsLimit -> 2)

  /** External dictionary already in the form of [[ExternalResource]], for which the Map member
    * `options` has an entry defined for `"tokenPattern"`.
    * ==Example==
    * {{{
    * val resource = ExternalResource(
    *   "src/test/resources/spell/words.txt",
    *   ReadAs.TEXT,
    *   Map("tokenPattern" -> "\\S+")
    * )
    * val spellChecker = new SymmetricDeleteApproach()
    *   .setInputCols("token")
    *   .setOutputCol("spell")
    *   .setDictionary(resource)
    * }}}
    *
    * @group setParam
    */
  def setDictionary(value: ExternalResource): this.type = {
    require(
      value.options.contains("tokenPattern"),
      "dictionary needs a 'tokenPattern' regex for separating words")
    set(dictionary, value)
  }

  /** Path to a file with properly spelled words. `tokenPattern` is the regex pattern used to
    * identify words in the file, and `readAs` can be `ReadAs.TEXT` or `ReadAs.SPARK`, with
    * `options` passed to the Spark reader if the latter is set. The dictionary needs the
    * `tokenPattern` regex for separating words.
    *
    * @group setParam
    */
  def setDictionary(
      path: String,
      tokenPattern: String = "\\S+",
      readAs: ReadAs.Format = ReadAs.TEXT,
      options: Map[String, String] = Map("format" -> "text")): this.type =
    set(
      dictionary,
      ExternalResource(path, readAs, options ++ Map("tokenPattern" -> tokenPattern)))

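  // For example, setDictionary("src/test/resources/spell/words.txt") is equivalent to
  // setDictionary(ExternalResource("src/test/resources/spell/words.txt", ReadAs.TEXT,
  //   Map("format" -> "text", "tokenPattern" -> "\\S+")))
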
  /** Output annotator type: TOKEN
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = TOKEN

  /** Input annotator type: TOKEN
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)

  def this() =
    this(
      Identifiable.randomUID("SYMSPELL")
    ) // this no-argument constructor is required for the annotator to work from Python

  /** Given a word, derive strings with up to maxEditDistance characters deleted */
  def getDeletes(word: String, med: Int): List[String] = {

    var deletes = new ListBuffer[String]()
    var queueList = List(word)
    val x = 1 to med
    x.foreach(_ => {
      var tempQueue = new ListBuffer[String]()
      queueList.foreach(w => {
        if (w.length > 1) {
          val y = 0 until w.length
          y.foreach(c => { // character index
            // result of word minus the character at index c
            val wordMinus = w.substring(0, c).concat(w.substring(c + 1, w.length))
            if (!deletes.contains(wordMinus)) {
              deletes += wordMinus
            }
            if (!tempQueue.contains(wordMinus)) {
              tempQueue += wordMinus
            }
          }) // End y.foreach
          queueList = tempQueue.toList
        }
      }) // End queueList.foreach
    }) // End x.foreach

    deletes.toList
  }

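  // For example, getDeletes("band", 1) yields List("and", "bnd", "bad", "ban"):
  // the four single-character deletions, in character-index order.
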
  /** Computes derived words from a frequency of words */
  def derivedWordDistances(
      wordFrequencies: List[(String, Long)],
      maxEditDistance: Int): Map[String, (List[String], Long)] = {

    val derivedWords = scala.collection.mutable.Map(wordFrequencies.map { a =>
      (a._1, (ListBuffer.empty[String], a._2))
    }: _*)

    wordFrequencies.foreach { case (word, _) =>
      val deletes = getDeletes(word, maxEditDistance)

      deletes.foreach(deleteItem => {
        if (derivedWords.contains(deleteItem)) {
          // add (correct) word to delete's suggested correction list
          derivedWords(deleteItem)._1 += word
        } else {
          // note frequency of word in corpus is not incremented
          derivedWords(deleteItem) = (ListBuffer(word), 0L)
        }
      }) // End deletes.foreach
    }
    derivedWords
      .filterKeys(a => derivedWords(a)._1.length >= $(deletesThreshold))
      .mapValues(derivedWords => (derivedWords._1.toList, derivedWords._2))
      .toMap
  }

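  // For example, derivedWordDistances(List(("band", 10L)), 1) maps "band" to
  // (List(), 10) and each deletion, e.g. "bnd", to (List("band"), 0): at the
  // default deletesThreshold of 0, every delete candidate points back to the
  // correctly spelled words it was derived from.
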
  override def train(
      dataset: Dataset[_],
      recursivePipeline: Option[PipelineModel]): SymmetricDeleteModel = {

    require(!dataset.rdd.isEmpty(), "Dataset for training is empty")

    validateDataSet(dataset)

    val possibleDict = get(dictionary).map(d => ResourceHelper.getWordCount(d))

    val trainDataSet =
      dataset
        .select(getInputCols.head)
        .as[Array[Annotation]]
        .flatMap(_.map(_.result))

    val wordFrequencies =
      trainDataSet
        .groupBy("value")
        .count()
        .filter(s"count(value) >= ${$(frequencyThreshold)}")
        .as[(String, Long)]
        .collect
        .toList

    val derivedWords =
      derivedWordDistances(wordFrequencies, $(maxEditDistance))

    val longestWordLength =
      trainDataSet.agg(max(length(col("value")))).head().getInt(0)

    val model =
      new SymmetricDeleteModel()
        .setDerivedWords(derivedWords)
        .setLongestWordLength(longestWordLength)

    if (possibleDict.isDefined) {
      val min = wordFrequencies.minBy(_._2)._2
      val max = wordFrequencies.maxBy(_._2)._2
      model.setMinFrequency(min)
      model.setMaxFrequency(max)
      model.setDictionary(possibleDict.get.toMap)
    }

    model
  }

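  // Note: train() is invoked by Pipeline.fit(). The fitted SymmetricDeleteModel
  // carries the derived-word map, the longest word length seen in training and,
  // if a dictionary was set, its word counts plus the min/max corpus frequencies.
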
  private def validateDataSet(dataset: Dataset[_]): Unit = {
305
    try {
306
      dataset.select(getInputCols.head).as[Array[Annotation]]
1✔
307
    } catch {
308
      case exception: AnalysisException =>
309
        if (exception.getMessage == "need an array field but got string;") {
1✔
310
          throw new IllegalArgumentException(
×
311
            "Train dataset must have an array annotation type column")
312
        }
313
        throw exception
1✔
314
    }
315
  }
316

}

// This object reads the class's properties; it enables reading the model back after it is stored

/** This is the companion object of [[SymmetricDeleteApproach]]. Please refer to that class for
  * the documentation.
  */
object SymmetricDeleteApproach extends DefaultParamsReadable[SymmetricDeleteApproach]
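
The scaladoc example above stops at `pipeline.fit(trainingData)`. Below is a minimal sketch of the full round trip that also calls `transform` on held-out text. It assumes a local SparkSession with Spark NLP on the classpath; the sample sentences are made up, and the dictionary path is the one used in the scaladoc example.

import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteApproach
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("SymmetricDeleteSketch")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

// Hypothetical data: the training corpus supplies the word frequencies.
val trainingData = Seq("the gummy bears were the best gift").toDF("text")
val testData = Seq("the gummi bers").toDF("text")

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("token")

val spellChecker = new SymmetricDeleteApproach()
  .setInputCols("token")
  .setOutputCol("spell")
  .setDictionary("src/test/resources/spell/words.txt")

val pipeline = new Pipeline()
  .setStages(Array(documentAssembler, tokenizer, spellChecker))

// fit() drives SymmetricDeleteApproach.train() above.
val model = pipeline.fit(trainingData)

// Each token comes back as a TOKEN annotation carrying its corrected form.
model.transform(testData).selectExpr("explode(spell.result)").show(false)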