• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 4992350528

pending completion
4992350528

Pull #13797

github

GitHub
Merge 424c7ff18 into ef7906c5e
Pull Request #13797: SPARKNLP-835: ProtectedParam and ProtectedFeature

24 of 24 new or added lines in 6 files covered. (100.0%)

8643 of 13129 relevant lines covered (65.83%)

0.66 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.43
/src/main/scala/com/johnsnowlabs/nlp/annotators/NormalizerModel.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators
18

19
import com.johnsnowlabs.nlp.AnnotatorType.TOKEN
20
import com.johnsnowlabs.nlp.serialization.MapFeature
21
import com.johnsnowlabs.nlp.{
22
  Annotation,
23
  AnnotatorModel,
24
  HasSimpleAnnotate,
25
  ParamsAndFeaturesReadable
26
}
27
import org.apache.spark.ml.param.{BooleanParam, IntParam, StringArrayParam}
28
import org.apache.spark.ml.util.Identifiable
29

30
/** Instantiated Model of the [[Normalizer]]. For usage and examples, please see the documentation
31
  * of that class.
32
  *
33
  * @see
34
  *   [[Normalizer]] for the base class
35
  * @param uid
36
  *   required internal uid for saving annotator
37
  * @groupname anno Annotator types
38
  * @groupdesc anno
39
  *   Required input and expected output annotator types
40
  * @groupname Ungrouped Members
41
  * @groupname param Parameters
42
  * @groupname setParam Parameter setters
43
  * @groupname getParam Parameter getters
44
  * @groupname Ungrouped Members
45
  * @groupprio param  1
46
  * @groupprio anno  2
47
  * @groupprio Ungrouped 3
48
  * @groupprio setParam  4
49
  * @groupprio getParam  5
50
  * @groupdesc param
51
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
52
  *   parameter values through setters and getters, respectively.
53
  */
54
class NormalizerModel(override val uid: String)
55
    extends AnnotatorModel[NormalizerModel]
56
    with HasSimpleAnnotate[NormalizerModel] {
57

58
  /** Output annotator type : TOKEN
59
    *
60
    * @group anno
61
    */
62
  override val outputAnnotatorType: AnnotatorType = TOKEN
1✔
63

64
  /** Input annotator type : TOKEN
65
    *
66
    * @group anno
67
    */
68
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)
1✔
69

70
  case class TokenizerAndNormalizerMap(
71
      beginTokenizer: Int,
72
      endTokenizer: Int,
73
      token: String,
74
      beginNormalizer: Int,
75
      endNormalizer: Int,
76
      normalizer: String)
77

78
  /** normalization regex patterns which match will be removed from token
79
    *
80
    * @group param
81
    */
82
  val cleanupPatterns = new StringArrayParam(
1✔
83
    this,
84
    "cleanupPatterns",
1✔
85
    "normalization regex patterns which match will be removed from token")
1✔
86

87
  /** @group setParam */
88
  def setCleanupPatterns(value: Array[String]): this.type = set(cleanupPatterns, value)
1✔
89

90
  /** @group getParam */
91
  def getCleanupPatterns: Array[String] = $(cleanupPatterns)
×
92

93
  /** whether to convert strings to lowercase
94
    *
95
    * @group param
96
    */
97
  val lowercase = new BooleanParam(this, "lowercase", "whether to convert strings to lowercase")
1✔
98

99
  /** @group setParam */
100
  def setLowercase(value: Boolean): this.type = set(lowercase, value)
1✔
101

102
  /** @group getParam */
103
  def getLowercase: Boolean = $(lowercase)
×
104

105
  /** Dictionary mapping slang terms to the text that replaces them
106
    *
107
    * @group param
108
    */
109
  protected val slangDict: MapFeature[String, String] = new MapFeature(this, "slangDict")
1✔
110

111
  /** whether or not to be case sensitive to match slangs. Defaults to false.
112
    *
113
    * @group param
114
    */
115
  val slangMatchCase = new BooleanParam(
1✔
116
    this,
117
    "slangMatchCase",
1✔
118
    "whether or not to be case sensitive to match slangs. Defaults to false.")
1✔
119

120
  /** @group setParam */
121
  def setSlangMatchCase(value: Boolean): this.type = set(slangMatchCase, value)
1✔
122

123
  /** @group getParam */
124
  def getSlangMatchCase: Boolean = $(slangMatchCase)
×
125

126
  def this() = this(Identifiable.randomUID("NORMALIZER"))
1✔
127

128
  /** @group setParam */
129
  def setSlangDict(value: Map[String, String]): this.type = set(slangDict, value)
1✔
130

131
  /** Set the minimum allowed length for each token
132
    *
133
    * @group param
134
    */
135
  val minLength = new IntParam(this, "minLength", "Set the minimum allowed length for each token")
1✔
136

137
  /** Sets the minimum allowed length for each token.
    *
    * Both preconditions are checked with `require`, so a violation fails before the parameter
    * is written.
    *
    * @param value
    *   minimum token length; must not be negative
    * @group setParam
    */
  def setMinLength(value: Int): this.type = {
    val isNonNegative = value >= 0
    require(isNonNegative, "minLength must be greater equal than 0")
    val fitsInInt = value.isValidInt
    require(fitsInInt, "minLength must be Int")
    set(minLength, value)
  }
143

144
  /** @group getParam */
145
  def getMinLength: Int = $(minLength)
×
146

147
  /** Set the maximum allowed length for each token
148
    *
149
    * @group param
150
    */
151
  val maxLength = new IntParam(this, "maxLength", "Set the maximum allowed length for each token")
1✔
152

153
  /** Sets the maximum allowed length for each token.
    *
    * Both preconditions are checked with `require`, so a violation fails before the parameter
    * is written.
    *
    * @param value
    *   maximum token length; must not be smaller than the current value of `minLength`
    * @group setParam
    */
  def setMaxLength(value: Int): this.type = {
    require(value >= $(minLength), "maxLength must be greater equal than minLength")
    // The message previously named minLength, which pointed callers at the wrong parameter.
    require(value.isValidInt, "maxLength must be Int")
    set(maxLength, value)
  }
163

164
  /** @group getParam */
165
  def getMaxLength: Int = $(maxLength)
×
166

167
  /** Removes every match of the configured cleanup patterns from a word.
    *
    * The patterns are applied one after another with `String.replaceAll`, each working on the
    * output of the previous one. When `get(cleanupPatterns)` is empty the word is returned
    * unchanged.
    *
    * @param word
    *   text to clean
    * @return
    *   the word with all pattern matches replaced by the empty string
    */
  def applyRegexPatterns(word: String): String =
    get(cleanupPatterns).fold(word) { patterns =>
      patterns.foldLeft(word)((remaining, pattern) => remaining.replaceAll(pattern, ""))
    }
178

179
  /** Slang dictionary mapping words to the text they are transformed into
180
    *
181
    * @group getParam
182
    */
183
  protected def getSlangDict: Map[String, String] = $$(slangDict)
×
184

185
  /** Normalizes each incoming token annotation: slang replacement, regex cleanup, optional
    * lowercasing and length filtering.
    *
    * ToDo: Review implementation, Current implementation generates spaces between non-words,
    * potentially breaking tokens
    *
    * @param annotations
    *   token annotations to normalize
    * @return
    *   one annotation per surviving normalized token; each keeps the begin offset and metadata
    *   of the token it came from
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
    annotations.flatMap { originalToken =>
      // slang dictionary keys should have been lowercased if slangMatchCase is false
      val slangEntries = $$(slangDict)
      val lookupKey =
        if ($(slangMatchCase)) originalToken.result
        else originalToken.result.toLowerCase

      // simple-tokenize the unslanged slang phrase; without a match the token is kept as is
      val candidates = slangEntries.get(lookupKey) match {
        case Some(replacement) => replacement.split(" ")
        case None => Array(originalToken.result)
      }

      val cleaned = candidates.map(applyRegexPatterns)
      val cased = if ($(lowercase)) cleaned.map(_.toLowerCase) else cleaned

      cased.collect {
        case finalToken
            if finalToken.nonEmpty &&
              finalToken.length >= $(minLength) &&
              get(maxLength).forall(finalToken.length <= _) =>
          // every piece starts at the original begin offset; the end follows its own length
          Annotation(
            outputAnnotatorType,
            originalToken.begin,
            originalToken.begin + finalToken.length - 1,
            finalToken,
            originalToken.metadata)
      }
    }
227

228
}
229

230
object NormalizerModel extends ParamsAndFeaturesReadable[NormalizerModel]
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc