JohnSnowLabs / spark-nlp / 13883000244

16 Mar 2025 11:44AM UTC coverage: 59.034% (-1.0%) from 60.072%
Pull Request #14444: Sparknlp 1060 implement phi 3.5 vision
Merge 6d717703b into 05000ab4a

0 of 292 new or added lines in 5 files covered. (0.0%)

20 existing lines in 14 files now uncovered.

9413 of 15945 relevant lines covered (59.03%)

0.59 hits per line

Source File (90.32% covered):
/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteApproach.scala
/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators.spell.symmetric

import com.johnsnowlabs.nlp.annotators.param.ExternalResourceParam
import com.johnsnowlabs.nlp.util.io.ResourceHelper.spark.implicits._
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import com.johnsnowlabs.nlp.{Annotation, AnnotatorApproach}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{AnalysisException, Dataset}

import scala.collection.mutable.ListBuffer

/** Trains a Symmetric Delete spelling correction algorithm. Retrieves tokens and utilizes
  * distance metrics to compute possible derived words.
  *
  * The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate
  * generation and dictionary lookup for a given Damerau-Levenshtein distance. It is six orders of
  * magnitude faster (than the standard approach with deletes + transposes + replaces + inserts)
  * and language independent. A dictionary of correct spellings must be provided with
  * `setDictionary` either in the form of a text file or directly as an
  * [[com.johnsnowlabs.nlp.util.io.ExternalResource ExternalResource]], where each word is parsed
  * by a regex pattern.
  *
  * Inspired by [[https://github.com/wolfgarbe/SymSpell SymSpell]].
  *
  * For instantiated/pretrained models, see [[SymmetricDeleteModel]].
  *
  * See
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteModelTestSpec.scala SymmetricDeleteModelTestSpec]]
  * for further reference.
  *
  * ==Example==
  * In this example, the dictionary `"words.txt"` has the form of
  * {{{
  * ...
  * gummy
  * gummic
  * gummier
  * gummiest
  * gummiferous
  * ...
  * }}}
  * This dictionary is then set to be the basis of the spell checker.
  * {{{
  * import com.johnsnowlabs.nlp.base.DocumentAssembler
  * import com.johnsnowlabs.nlp.annotators.Tokenizer
  * import com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteApproach
  * import org.apache.spark.ml.Pipeline
  *
  * val documentAssembler = new DocumentAssembler()
  *   .setInputCol("text")
  *   .setOutputCol("document")
  *
  * val tokenizer = new Tokenizer()
  *   .setInputCols("document")
  *   .setOutputCol("token")
  *
  * val spellChecker = new SymmetricDeleteApproach()
  *   .setInputCols("token")
  *   .setOutputCol("spell")
  *   .setDictionary("src/test/resources/spell/words.txt")
  *
  * val pipeline = new Pipeline().setStages(Array(
  *   documentAssembler,
  *   tokenizer,
  *   spellChecker
  * ))
  *
  * val pipelineModel = pipeline.fit(trainingData)
  * }}}
  *
  * @see
  *   [[com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach NorvigSweetingApproach]]
  *   for an alternative approach to spell checking
  * @see
  *   [[com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerApproach ContextSpellCheckerApproach]]
  *   for a DL based approach
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupprio param  1
  * @groupprio anno  2
  * @groupprio Ungrouped 3
  * @groupprio setParam  4
  * @groupprio getParam  5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
  */
class SymmetricDeleteApproach(override val uid: String)
    extends AnnotatorApproach[SymmetricDeleteModel]
    with SymmetricDeleteParams {

  import com.johnsnowlabs.nlp.AnnotatorType._

  /** Spell checking algorithm inspired by the Symmetric Delete algorithm */
  override val description: String =
    "Spell checking algorithm inspired by the Symmetric Delete algorithm"

  /** Optional dictionary of properly written words. If provided, significantly boosts spell
    * checking performance.
    *
    * Needs `"tokenPattern"` (Default: `\S+`) for parsing the resource.
    * ==Example==
    * {{{
    * ...
    * gummy
    * gummic
    * gummier
    * gummiest
    * gummiferous
    * ...
    * }}}
    *
    * @group param
    */
  val dictionary =
    new ExternalResourceParam(this, "dictionary", "file with a list of correct words")

  setDefault(frequencyThreshold -> 0, deletesThreshold -> 0, maxEditDistance -> 3, dupsLimit -> 2)

  /** External dictionary already in the form of [[ExternalResource]], for which the Map member
    * `options` has an entry defined for `"tokenPattern"`.
    * ==Example==
    * {{{
    * val resource = ExternalResource(
    *   "src/test/resources/spell/words.txt",
    *   ReadAs.TEXT,
    *   Map("tokenPattern" -> "\\S+")
    * )
    * val spellChecker = new SymmetricDeleteApproach()
    *   .setInputCols("token")
    *   .setOutputCol("spell")
    *   .setDictionary(resource)
    * }}}
    *
    * @group setParam
    */
  def setDictionary(value: ExternalResource): this.type = {
    require(
      value.options.contains("tokenPattern"),
      "dictionary needs a 'tokenPattern' regex for separating words")
    set(dictionary, value)
  }

  /** Path to a file with properly spelled words. `tokenPattern` is the regex pattern used to
    * identify words in the file, and `readAs` can be `ReadAs.TEXT` or `ReadAs.SPARK`, with
    * `options` passed to the Spark reader if the latter is set. The dictionary needs the
    * `tokenPattern` regex for separating words.
    *
    * @group setParam
    */
  def setDictionary(
      path: String,
      tokenPattern: String = "\\S+",
      readAs: ReadAs.Format = ReadAs.TEXT,
      options: Map[String, String] = Map("format" -> "text")): this.type =
    set(
      dictionary,
      ExternalResource(path, readAs, options ++ Map("tokenPattern" -> tokenPattern)))

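  // For example, setDictionary("src/test/resources/spell/words.txt") is equivalent to
  // setDictionary(ExternalResource("src/test/resources/spell/words.txt", ReadAs.TEXT,
  //   Map("format" -> "text", "tokenPattern" -> "\\S+")))
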
  /** Output annotator type: TOKEN
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = TOKEN

  /** Input annotator type: TOKEN
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)

  def this() =
    this(
      Identifiable.randomUID("SYMSPELL")
    ) // this no-argument constructor is required for the annotator to work from Python

  /** Given a word, derive strings with up to maxEditDistance characters deleted */
  def getDeletes(word: String, med: Int): List[String] = {

    var deletes = new ListBuffer[String]()
    var queueList = List(word)
    val x = 1 to med
    x.foreach(_ => {
      var tempQueue = new ListBuffer[String]()
      queueList.foreach(w => {
        if (w.length > 1) {
          val y = 0 until w.length
          y.foreach(c => { // character index
            // result of word minus the character at index c
            val wordMinus = w.substring(0, c).concat(w.substring(c + 1, w.length))
            if (!deletes.contains(wordMinus)) {
              deletes += wordMinus
            }
            if (!tempQueue.contains(wordMinus)) {
              tempQueue += wordMinus
            }
          }) // End y.foreach
          queueList = tempQueue.toList
        }
      }) // End queueList.foreach
    }) // End x.foreach

    deletes.toList
  }

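  // For example, getDeletes("band", 1) yields List("and", "bnd", "bad", "ban"):
  // the four single-character deletions, in character-index order.
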
  /** Computes derived words from a frequency of words */
  def derivedWordDistances(
      wordFrequencies: List[(String, Long)],
      maxEditDistance: Int): Map[String, (List[String], Long)] = {

    val derivedWords = scala.collection.mutable.Map(wordFrequencies.map { a =>
      (a._1, (ListBuffer.empty[String], a._2))
    }: _*)

    wordFrequencies.foreach { case (word, _) =>
      val deletes = getDeletes(word, maxEditDistance)

      deletes.foreach(deleteItem => {
        if (derivedWords.contains(deleteItem)) {
          // add (correct) word to delete's suggested correction list
          derivedWords(deleteItem)._1 += word
        } else {
          // note frequency of word in corpus is not incremented
          derivedWords(deleteItem) = (ListBuffer(word), 0L)
        }
      }) // End deletes.foreach
    }
    derivedWords
      .filterKeys(a => derivedWords(a)._1.length >= $(deletesThreshold))
      .mapValues(derivedWords => (derivedWords._1.toList, derivedWords._2))
      .toMap
  }

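  // For example, derivedWordDistances(List(("band", 10L)), 1) maps "band" to
  // (List(), 10) and each deletion, e.g. "bnd", to (List("band"), 0): at the
  // default deletesThreshold of 0, every delete candidate points back to the
  // correctly spelled words it was derived from.
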
  override def train(
      dataset: Dataset[_],
      recursivePipeline: Option[PipelineModel]): SymmetricDeleteModel = {

    require(!dataset.rdd.isEmpty(), "Dataset for training is empty")

    validateDataSet(dataset)

    val possibleDict = get(dictionary).map(d => ResourceHelper.getWordCount(d))

    val trainDataSet =
      dataset
        .select(getInputCols.head)
        .as[Array[Annotation]]
        .flatMap(_.map(_.result))

    val wordFrequencies =
      trainDataSet
        .groupBy("value")
        .count()
        .filter(s"count(value) >= ${$(frequencyThreshold)}")
        .as[(String, Long)]
        .collect
        .toList

    val derivedWords =
      derivedWordDistances(wordFrequencies, $(maxEditDistance))

    val longestWordLength =
      trainDataSet.agg(max(length(col("value")))).head().getInt(0)

    val model =
      new SymmetricDeleteModel()
        .setDerivedWords(derivedWords)
        .setLongestWordLength(longestWordLength)

    if (possibleDict.isDefined) {
      val min = wordFrequencies.minBy(_._2)._2
      val max = wordFrequencies.maxBy(_._2)._2
      model.setMinFrequency(min)
      model.setMaxFrequency(max)
      model.setDictionary(possibleDict.get.toMap)
    }

    model
  }

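  // Note: train() is invoked by Pipeline.fit(). The fitted SymmetricDeleteModel
  // carries the derived-word map, the longest word length seen in training and,
  // if a dictionary was set, its word counts plus the min/max corpus frequencies.
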
  private def validateDataSet(dataset: Dataset[_]): Unit = {
305
    try {
306
      dataset.select(getInputCols.head).as[Array[Annotation]]
1✔
307
    } catch {
308
      case exception: AnalysisException =>
309
        if (exception.getMessage == "need an array field but got string;") {
1✔
310
          throw new IllegalArgumentException(
×
311
            "Train dataset must have an array annotation type column")
312
        }
313
        throw exception
1✔
314
    }
315
  }
316

}

// This object reads the class's properties; it enables reading the model back after it is stored

/** This is the companion object of [[SymmetricDeleteApproach]]. Please refer to that class for
  * the documentation.
  */
object SymmetricDeleteApproach extends DefaultParamsReadable[SymmetricDeleteApproach]
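
The scaladoc example above stops at `pipeline.fit(trainingData)`. Below is a minimal sketch of the full round trip that also calls `transform` on held-out text. It assumes a local SparkSession with Spark NLP on the classpath; the sample sentences are made up, and the dictionary path is the one used in the scaladoc example.

import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteApproach
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("SymmetricDeleteSketch")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

// Hypothetical data: the training corpus supplies the word frequencies.
val trainingData = Seq("the gummy bears were the best gift").toDF("text")
val testData = Seq("the gummi bers").toDF("text")

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("token")

val spellChecker = new SymmetricDeleteApproach()
  .setInputCols("token")
  .setOutputCol("spell")
  .setDictionary("src/test/resources/spell/words.txt")

val pipeline = new Pipeline()
  .setStages(Array(documentAssembler, tokenizer, spellChecker))

// fit() drives SymmetricDeleteApproach.train() above.
val model = pipeline.fit(trainingData)

// Each token comes back as a TOKEN annotation carrying its corrected form.
model.transform(testData).selectExpr("explode(spell.result)").show(false)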