4992350528

Build Type

github

Committed by GitHub

Commit Message

Merge 424c7ff18 into ef7906c5e

Pull Request #13797: SPARKNLP-835: ProtectedParam and ProtectedFeature

Run Details

24 of 24 new or added lines in 6 files covered. (100.0%)

8643 of 13129 relevant lines covered (65.83%)

0.66 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.43

/src/main/scala/com/johnsnowlabs/nlp/annotators/NormalizerModel.scala

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.AnnotatorType.TOKEN
import com.johnsnowlabs.nlp.serialization.MapFeature
import com.johnsnowlabs.nlp.{
  Annotation,
  AnnotatorModel,
  HasSimpleAnnotate,
  ParamsAndFeaturesReadable
}
import org.apache.spark.ml.param.{BooleanParam, IntParam, StringArrayParam}
import org.apache.spark.ml.util.Identifiable

/** Instantiated Model of the [[Normalizer]]. For usage and examples, please see the documentation
  * of that class.
  *
  * For every incoming TOKEN annotation the model (1) looks the token up in the slang dictionary,
  * (2) splits a found replacement on spaces, (3) removes everything matching the cleanup regex
  * patterns, (4) optionally lowercases the result and (5) drops tokens that are empty or fall
  * outside the configured length bounds.
  *
  * @see
  *   [[Normalizer]] for the base class
  * @param uid
  *   required internal uid for saving annotator
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupprio param  1
  * @groupprio anno  2
  * @groupprio Ungrouped 3
  * @groupprio setParam  4
  * @groupprio getParam  5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
  */
class NormalizerModel(override val uid: String)
    extends AnnotatorModel[NormalizerModel]
    with HasSimpleAnnotate[NormalizerModel] {

  /** Output annotator type : TOKEN
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = TOKEN

  /** Input annotator type : TOKEN
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)

  /** Pairs a token span with its normalized counterpart. Not referenced anywhere else in this
    * class; kept because it is part of the public surface.
    */
  case class TokenizerAndNormalizerMap(
      beginTokenizer: Int,
      endTokenizer: Int,
      token: String,
      beginNormalizer: Int,
      endNormalizer: Int,
      normalizer: String)

  /** normalization regex patterns which match will be removed from token
    *
    * @group param
    */
  val cleanupPatterns = new StringArrayParam(
    this,
    "cleanupPatterns",
    "normalization regex patterns which match will be removed from token")

  /** @group setParam */
  def setCleanupPatterns(value: Array[String]): this.type = set(cleanupPatterns, value)

  /** @group getParam */
  def getCleanupPatterns: Array[String] = $(cleanupPatterns)

  /** whether to convert strings to lowercase
    *
    * @group param
    */
  val lowercase = new BooleanParam(this, "lowercase", "whether to convert strings to lowercase")

  /** @group setParam */
  def setLowercase(value: Boolean): this.type = set(lowercase, value)

  /** @group getParam */
  def getLowercase: Boolean = $(lowercase)

  /** Dictionary mapping a slang token to its replacement phrase
    *
    * @group param
    */
  protected val slangDict: MapFeature[String, String] = new MapFeature(this, "slangDict")

  /** whether or not to be case sensitive to match slangs. Defaults to false.
    *
    * @group param
    */
  val slangMatchCase = new BooleanParam(
    this,
    "slangMatchCase",
    "whether or not to be case sensitive to match slangs. Defaults to false.")

  /** @group setParam */
  def setSlangMatchCase(value: Boolean): this.type = set(slangMatchCase, value)

  /** @group getParam */
  def getSlangMatchCase: Boolean = $(slangMatchCase)

  /** Creates a model with a randomly generated uid prefixed with "NORMALIZER". */
  def this() = this(Identifiable.randomUID("NORMALIZER"))

  /** @group setParam */
  def setSlangDict(value: Map[String, String]): this.type = set(slangDict, value)

  /** Set the minimum allowed length for each token
    *
    * @group param
    */
  val minLength = new IntParam(this, "minLength", "Set the minimum allowed length for each token")

  /** Sets the minimum allowed token length.
    *
    * @param value
    *   minimum length, must be non-negative
    * @throws IllegalArgumentException
    *   if `value` is negative
    * @group setParam
    */
  def setMinLength(value: Int): this.type = {
    require(value >= 0, "minLength must be greater equal than 0")
    // No `isValidInt` check: the argument is already typed as Int, so it could never fail.
    set(minLength, value)
  }

  /** @group getParam */
  def getMinLength: Int = $(minLength)

  /** Set the maximum allowed length for each token
    *
    * @group param
    */
  val maxLength = new IntParam(this, "maxLength", "Set the maximum allowed length for each token")

  /** Sets the maximum allowed token length.
    *
    * The bound is validated against the current value of `minLength`, which is read with `$` and
    * therefore must be set or have a default at the time of the call.
    *
    * @param value
    *   maximum length, must not be smaller than `minLength`
    * @throws IllegalArgumentException
    *   if `value` is smaller than `minLength`
    * @group setParam
    */
  def setMaxLength(value: Int): this.type = {
    require(value >= $(minLength), "maxLength must be greater equal than minLength")
    // No `isValidInt` check: the argument is already typed as Int, so it could never fail.
    set(maxLength, value)
  }

  /** @group getParam */
  def getMaxLength: Int = $(maxLength)

  /** Removes every match of each cleanup pattern from `word`, applying the patterns in array
    * order so that later patterns see the output of earlier ones.
    *
    * The patterns are read with `get`, i.e. only an explicitly set value is used; when none is
    * set the word is returned untouched.
    *
    * @param word
    *   token text to clean
    * @return
    *   the text with all pattern matches deleted
    */
  def applyRegexPatterns(word: String): String =
    get(cleanupPatterns)
      .map(_.foldLeft(word)((remaining, pattern) => remaining.replaceAll(pattern, "")))
      .getOrElse(word)

  /** Slang dictionary currently held by the model, mapping a slang token to its replacement
    *
    * @group getParam
    */
  protected def getSlangDict: Map[String, String] = $$(slangDict)

  /** Normalizes each incoming token annotation.
    *
    * One input annotation can yield zero, one or several output annotations: zero when the token
    * is filtered out, several when its slang replacement contains spaces. Every output keeps the
    * `begin` and metadata of the originating token and gets `end = begin + length - 1`, so the
    * pieces of a multi-word replacement all start at the same offset.
    *
    * ToDo: Review implementation, Current implementation generates spaces between non-words,
    * potentially breaking tokens
    *
    * @param annotations
    *   token annotations to normalize
    * @return
    *   normalized token annotations of type [[outputAnnotatorType]]
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
    annotations.flatMap { originalToken =>
      // Slang dictionary keys should have been lowercased if slangMatchCase is false.
      val unslanged = $$(slangDict).get(
        if ($(slangMatchCase)) originalToken.result
        else originalToken.result.toLowerCase)

      // Simple-tokenize the slang replacement phrase; tokens without a dictionary entry pass
      // through as a single-element array.
      val tokenizedUnslang = unslanged
        .map(_.split(" "))
        .getOrElse(Array(originalToken.result))

      val cleaned = tokenizedUnslang.map(applyRegexPatterns)

      val cased = if ($(lowercase)) cleaned.map(_.toLowerCase) else cleaned

      cased
        // maxLength is optional (read with `get`): when unset there is no upper bound.
        .filter(t =>
          t.nonEmpty && t.length >= $(minLength) && get(maxLength).forall(m => t.length <= m))
        .map { finalToken =>
          Annotation(
            outputAnnotatorType,
            originalToken.begin,
            originalToken.begin + finalToken.length - 1,
            finalToken,
            originalToken.metadata)
        }
    }

}

/** Companion object of [[NormalizerModel]].
  *
  * It only extends `ParamsAndFeaturesReadable[NormalizerModel]` and adds no members of its own.
  * NOTE(review): that trait is defined outside this file and presumably supplies the reader used
  * to load a saved model — confirm against its definition.
  */
object NormalizerModel extends ParamsAndFeaturesReadable[NormalizerModel]

1	/*
2	* Copyright 2017-2022 John Snow Labs
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*/
16
17	package com.johnsnowlabs.nlp.annotators
18
19	import com.johnsnowlabs.nlp.AnnotatorType.TOKEN
20	import com.johnsnowlabs.nlp.serialization.MapFeature
21	import com.johnsnowlabs.nlp.{
22	Annotation,
23	AnnotatorModel,
24	HasSimpleAnnotate,
25	ParamsAndFeaturesReadable
26	}
27	import org.apache.spark.ml.param.{BooleanParam, IntParam, StringArrayParam}
28	import org.apache.spark.ml.util.Identifiable
29
30	/** Instantiated Model of the [[Normalizer]]. For usage and examples, please see the documentation
31	* of that class.
32	*
33	* @see
34	* [[Normalizer]] for the base class
35	* @param uid
36	* required internal uid for saving annotator
37	* @groupname anno Annotator types
38	* @groupdesc anno
39	* Required input and expected output annotator types
40	* @groupname Ungrouped Members
41	* @groupname param Parameters
42	* @groupname setParam Parameter setters
43	* @groupname getParam Parameter getters
44	* @groupname Ungrouped Members
45	* @groupprio param 1
46	* @groupprio anno 2
47	* @groupprio Ungrouped 3
48	* @groupprio setParam 4
49	* @groupprio getParam 5
50	* @groupdesc param
51	* A list of (hyper-)parameter keys this annotator can take. Users can set and get the
52	* parameter values through setters and getters, respectively.
53	*/
54	class NormalizerModel(override val uid: String)
55	extends AnnotatorModel[NormalizerModel]
56	with HasSimpleAnnotate[NormalizerModel] {
57
58	/** Output annotator type : TOKEN
59	*
60	* @group anno
61	*/
62	override val outputAnnotatorType: AnnotatorType = TOKEN	1✔
63
64	/** Input annotator type : TOKEN
65	*
66	* @group anno
67	*/
68	override val inputAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)	1✔
69
70	case class TokenizerAndNormalizerMap(
71	beginTokenizer: Int,
72	endTokenizer: Int,
73	token: String,
74	beginNormalizer: Int,
75	endNormalizer: Int,
76	normalizer: String)
77
78	/** normalization regex patterns which match will be removed from token
79	*
80	* @group param
81	*/
82	val cleanupPatterns = new StringArrayParam(	1✔
83	this,
84	"cleanupPatterns",	1✔
85	"normalization regex patterns which match will be removed from token")	1✔
86
87	/** @group setParam */
88	def setCleanupPatterns(value: Array[String]): this.type = set(cleanupPatterns, value)	1✔
89
90	/** @group setParam */
91	def getCleanupPatterns: Array[String] = $(cleanupPatterns)	×
92
93	/** whether to convert strings to lowercase
94	*
95	* @group param
96	*/
97	val lowercase = new BooleanParam(this, "lowercase", "whether to convert strings to lowercase")	1✔
98
99	/** @group setParam */
100	def setLowercase(value: Boolean): this.type = set(lowercase, value)	1✔
101
102	/** @group setParam */
103	def getLowercase: Boolean = $(lowercase)	×
104
105	/** slangDict
106	*
107	* @group param
108	*/
109	protected val slangDict: MapFeature[String, String] = new MapFeature(this, "slangDict")	1✔
110
111	/** whether or not to be case sensitive to match slangs. Defaults to false.
112	*
113	* @group param
114	*/
115	val slangMatchCase = new BooleanParam(	1✔
116	this,
117	"slangMatchCase",	1✔
118	"whether or not to be case sensitive to match slangs. Defaults to false.")	1✔
119
120	/** @group setParam */
121	def setSlangMatchCase(value: Boolean): this.type = set(slangMatchCase, value)	1✔
122
123	/** @group getParam */
124	def getSlangMatchCase: Boolean = $(slangMatchCase)	×
125
126	def this() = this(Identifiable.randomUID("NORMALIZER"))	1✔
127
128	/** @group setParam */
129	def setSlangDict(value: Map[String, String]): this.type = set(slangDict, value)	1✔
130
131	/** Set the minimum allowed length for each token
132	*
133	* @group param
134	*/
135	val minLength = new IntParam(this, "minLength", "Set the minimum allowed length for each token")	1✔
136
137	/** @group setParam */
138	def setMinLength(value: Int): this.type = {
139	require(value >= 0, "minLength must be greater equal than 0")	×
140	require(value.isValidInt, "minLength must be Int")	1✔
141	set(minLength, value)	1✔
142	}
143
144	/** @group getParam */
145	def getMinLength: Int = $(minLength)	×
146
147	/** Set the maximum allowed length for each token
148	*
149	* @group param
150	*/
151	val maxLength = new IntParam(this, "maxLength", "Set the maximum allowed length for each token")	1✔
152
153	/** @group setParam */
154	def setMaxLength(value: Int): this.type = {
155	require(	×
156	value >= $ {	×
157	minLength	×
158	},
159	"maxLength must be greater equal than minLength")	×
160	require(value.isValidInt, "minLength must be Int")	×
161	set(maxLength, value)	×
162	}
163
164	/** @group getParam */
165	def getMaxLength: Int = $(maxLength)	×
166
167	def applyRegexPatterns(word: String): String = {
168
169	val nToken = {
170	get(cleanupPatterns)
171	.map(_.foldLeft(word)((currentText, compositeToken) => {
172	currentText.replaceAll(compositeToken, "")
173	}))
174	.getOrElse(word)	1✔
175	}
176	nToken
177	}
178
179	/** Txt file with delimited words to be transformed into something else
180	*
181	* @group getParam
182	*/
183	protected def getSlangDict: Map[String, String] = $$(slangDict)	×
184
185	/** ToDo: Review implementation, Current implementation generates spaces between non-words,
186	* potentially breaking tokens
187	*/
188	override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
189	val normalizedAnnotations = annotations.flatMap { originalToken =>	1✔
190	/** slang dictionary keys should have been lowercased if slangMatchCase is false */
191	val unslanged = $$(slangDict).get(	1✔
192	if ($(slangMatchCase)) originalToken.result	×
193	else originalToken.result.toLowerCase)	1✔
194
195	/** simple-tokenize the unslanged slag phrase */
196	val tokenizedUnslang = {
197	unslanged
198	.map(unslang => {
199	unslang.split(" ")	1✔
200	})
201	.getOrElse(Array(originalToken.result))	1✔
202	}
203
204	val cleaned = tokenizedUnslang.map(word => applyRegexPatterns(word))	1✔
205
206	val cased = if ($(lowercase)) cleaned.map(_.toLowerCase) else cleaned	1✔
207
208	cased
209	.filter(t =>	1✔
210	t.nonEmpty && t.length >= $(minLength) && get(maxLength).forall(m => t.length <= m))	1✔
211	.map { finalToken =>	1✔
212	{
213	Annotation(	1✔
214	outputAnnotatorType,	1✔
215	originalToken.begin,	1✔
216	originalToken.begin + finalToken.length - 1,	1✔
217	finalToken,
218	originalToken.metadata)	1✔
219	}
220	}
221
222	}
223
224	normalizedAnnotations
225
226	}
227
228	}
229
230	object NormalizerModel extends ParamsAndFeaturesReadable[NormalizerModel]

JohnSnowLabs / spark-nlp / 4992350528

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous