JohnSnowLabs / spark-nlp / build 18685790193

21 Oct 2025 01:39PM UTC coverage: 55.216%. First build: 18685790193

Pull Request #14676: Spark NLP 6.2.0 Release (merge 427de3761 into b827818c7, via github / web-flow)

147 of 185 new or added lines in 7 files covered (79.46%)

11924 of 21595 relevant lines covered (55.22%)

0.55 hits per line

Source File

File coverage: 77.78%
/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentNormalizer.scala
/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT
import com.johnsnowlabs.nlp.annotators.cleaners.util.CleanerHelper
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, HasSimpleAnnotate}
import org.apache.spark.ml.param.{BooleanParam, Param, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.slf4j.{Logger, LoggerFactory}

import java.nio.charset.{Charset, StandardCharsets}
import java.util.Locale
import scala.collection.mutable.ListBuffer
import scala.util.matching.Regex
import scala.util.{Failure, Success, Try}
import scala.xml.XML

/** Annotator which normalizes raw text from tagged sources, e.g. scraped web pages or XML
  * documents, in document type columns. Removes unwanted characters that match one or more input
  * regex patterns, applying a configurable removal policy, and can optionally lowercase the
  * text.
  *
  * For extended examples of usage, see the
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb Examples]].
  *
  * ==Example==
  * {{{
  * import spark.implicits._
  * import com.johnsnowlabs.nlp.DocumentAssembler
  * import com.johnsnowlabs.nlp.annotator.DocumentNormalizer
  * import org.apache.spark.ml.Pipeline
  *
  * val documentAssembler = new DocumentAssembler()
  *   .setInputCol("text")
  *   .setOutputCol("document")
  *
  * val cleanUpPatterns = Array("<[^>]*>")
  *
  * val documentNormalizer = new DocumentNormalizer()
  *   .setInputCols("document")
  *   .setOutputCol("normalizedDocument")
  *   .setAction("clean")
  *   .setPatterns(cleanUpPatterns)
  *   .setReplacement(" ")
  *   .setPolicy("pretty_all")
  *   .setLowercase(true)
  *
  * val pipeline = new Pipeline().setStages(Array(
  *   documentAssembler,
  *   documentNormalizer
  * ))
  *
  * val text =
  *   """
  * <div id="theworldsgreatest" class='my-right my-hide-small my-wide toptext' style="font-family:'Segoe UI',Arial,sans-serif">
  *   THE WORLD'S LARGEST WEB DEVELOPER SITE
  *   <h1 style="font-size:300%;">THE WORLD'S LARGEST WEB DEVELOPER SITE</h1>
  *   <p style="font-size:160%;">Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum..</p>
  * </div>
  *
  * </div>"""
  * val data = Seq(text).toDF("text")
  * val pipelineModel = pipeline.fit(data)
  *
  * val result = pipelineModel.transform(data)
  * result.selectExpr("normalizedDocument.result").show(truncate=false)
  * +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
  * |result                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
  * +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
  * |[ the world's largest web developer site the world's largest web developer site lorem ipsum is simply dummy text of the printing and typesetting industry. lorem ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. it has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. it was popularised in the 1960s with the release of letraset sheets containing lorem ipsum passages, and more recently with desktop publishing software like aldus pagemaker including versions of lorem ipsum..]|
  * +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
  * }}}
  * @param uid
  *   required uid for storing annotator to disk
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupprio param  1
  * @groupprio anno  2
  * @groupprio Ungrouped 3
  * @groupprio setParam  4
  * @groupprio getParam  5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
  */
class DocumentNormalizer(override val uid: String)
    extends AnnotatorModel[DocumentNormalizer]
    with HasSimpleAnnotate[DocumentNormalizer] {

  private val logger: Logger = LoggerFactory.getLogger(this.getClass)

  private val EMPTY_STR = ""
  private val BREAK_STR = "|##|"
  private val SPACE_STR = " "
  private val GENERIC_TAGS_REMOVAL_PATTERN = "<[^>]*>"

  /** Input annotator type : DOCUMENT
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array[AnnotatorType](DOCUMENT)

  /** Output annotator type : DOCUMENT
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = DOCUMENT

  def this() = this(Identifiable.randomUID("DOCUMENT_NORMALIZER"))

  /** Action to perform applying regex patterns on text
    *
    * @group param
    */
  val action: Param[String] =
    new Param(this, "action", "Action to perform applying regex patterns on text")

  /** Normalization regex patterns; matching text will be removed from the document (Default:
    * `Array("<[^>]*>")`)
    *
    * @group param
    */
  val patterns: StringArrayParam = new StringArrayParam(
    this,
    "patterns",
    "Normalization regex patterns which match will be removed from document. Default is \"<[^>]*>\"")

  /** Replacement string to apply when regexes match (Default: `" "`)
    *
    * @group param
    */
  val replacement: Param[String] =
    new Param(this, "replacement", "Replacement string to apply when regexes match")

  /** Whether to convert strings to lowercase (Default: `false`)
    *
    * @group param
    */
  val lowercase = new BooleanParam(
    this,
    "lowercase",
    "Whether to convert strings to lowercase (Default: `false`)")

  /** Removal policy to apply when removing patterns from text (Default: `"pretty_all"`).
    * Possible values are `"all", "pretty_all", "first", "pretty_first"`
    *
    * @group param
    */
  val policy: Param[String] =
    new Param(this, "policy", "RemovalPolicy to remove pattern from text")

  /** File encoding to apply on normalized documents (Default: `"disable"`)
    *
    * @group param
    */
  val encoding: Param[String] = new Param(
    this,
    name = "encoding",
    "File encoding to apply on normalized documents (Default: `disable`)")

  /** Single functional cleaner preset to apply, e.g. `CLEAN_BULLETS` or `CLEAN_DASHES`
    *
    * @group param
    */
  val presetPattern = new Param[String](
    this,
    "presetPattern",
    "Single functional cleaner preset (CLEAN_BULLETS, CLEAN_DASHES, etc.)")

  /** Automatic cleaning mode grouping multiple cleaners (Default: `""`). Possible values are
    * `"light_clean", "document_clean", "social_clean", "html_clean", "full_auto"`
    *
    * @group param
    */
  val autoMode = new Param[String](
    this,
    "autoMode",
    "Automatic cleaning mode grouping multiple cleaners: light_clean, document_clean, social_clean, html_clean, full_auto")

  //  Assuming non-html does not contain any < or > and that input string is correctly structured
  setDefault(
    inputCols -> Array(AnnotatorType.DOCUMENT),
    action -> "clean",
    patterns -> Array(GENERIC_TAGS_REMOVAL_PATTERN),
    replacement -> SPACE_STR,
    lowercase -> false,
    policy -> "pretty_all",
    encoding -> "disable",
    autoMode -> "")

  /** Action to perform on text. (Default `"clean"`).
    *
    * @group getParam
    */
  def getAction: String = $(action)

  /** Regular expressions list for normalization.
    *
    * @group getParam
    */
  def getPatterns: Array[String] = $(patterns)

  /** Replacement string to apply when regexes match (Default: `" "`)
    *
    * @group getParam
    */
  def getReplacement: String = $(replacement)

  /** Lowercase tokens (Default: `false`)
    *
    * @group getParam
    */
  def getLowercase: Boolean = $(lowercase)

  /** Policy to remove patterns from text (Default: `"pretty_all"`)
    *
    * @group getParam
    */
  def getPolicy: String = $(policy)

  /** Encoding to apply to normalized documents (Default: `"disable"`)
    *
    * @group getParam
    */
  def getEncoding: String = $(encoding)

  /** Action to perform on text. (Default `"clean"`).
    *
    * @group setParam
    */
  def setAction(value: String): this.type = set(action, value)

  /** Regular expressions list for normalization (Default: `Array("<[^>]*>")`)
    *
    * @group setParam
    */
  def setPatterns(value: Array[String]): this.type = set(patterns, value)

  /** Replacement string to apply when regexes match (Default: `" "`)
    *
    * @group setParam
    */
  def setReplacement(value: String): this.type = set(replacement, value)

  /** Whether to lowercase the text (Default: `false`)
    *
    * @group setParam
    */
  def setLowercase(value: Boolean): this.type = set(lowercase, value)

  /** Removal policy to apply (Default: `"pretty_all"`). Valid policy values are: "all",
    * "pretty_all", "first", "pretty_first"
    *
    * @group setParam
    */
  def setPolicy(value: String): this.type = set(policy, value)

  /** Encoding to apply to normalized documents (Default: `"disable"`). Valid values are:
    * disable, UTF-8, UTF-16, US-ASCII, ISO-8859-1, UTF-16BE, UTF-16LE
    *
    * @group setParam
    */
  def setEncoding(value: String): this.type = set(encoding, value)

  /** Functional cleaner preset to apply, e.g. `CLEAN_BULLETS` or `CLEAN_DASHES`
    *
    * @group setParam
    */
  def setPresetPattern(value: String): this.type = set(presetPattern, value)

  /** Automatic cleaning mode bundling multiple cleaners, e.g. `document_clean` or `full_auto`
    *
    * @group setParam
    */
  def setAutoMode(value: String): this.type = set(autoMode, value)
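
  // Illustrative usage sketch of the cleaner parameters (column names assumed, as in the class
  // example above); if both are set, autoMode takes precedence over presetPattern:
  //
  //   val documentNormalizer = new DocumentNormalizer()
  //     .setInputCols("document")
  //     .setOutputCol("normalizedDocument")
  //     .setAutoMode("document_clean")      // bullets, ordered bullets, dashes, extra whitespace
  //
  //   // or a single functional cleaner instead of a bundle:
  //   // documentNormalizer.setPresetPattern("CLEAN_BULLETS")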

  /** Applies document normalization to all matches, without pretty formatting (i.e. without
    * collapsing multiple spaces)
    */
  private def withAllFormatter(
      text: String,
      action: String,
      patterns: Array[String],
      replacement: String): String = {
    action match {
      case "clean" =>
        val patternsStr: String = patterns.mkString(BREAK_STR)
        text.replaceAll(patternsStr, replacement)
      case "extract" =>
        val htmlXml = XML.loadString(text)
        val textareaContents = (htmlXml \\ patterns.mkString).text
        textareaContents
      case "lookaround" =>
        LookAroundManager.process(text, patterns, replacement)
      case _ =>
        throw new Exception(
          "Unknown action parameter in DocumentNormalizer annotation. " +
            "Please select one of: clean, extract or lookaround")
    }
  }

  /** Pretty variant of withAllFormatter: collapses runs of whitespace into single spaces */
  private def withPrettyAllFormatter(
      text: String,
      action: String,
      patterns: Array[String],
      replacement: String): String = {
    withAllFormatter(text, action, patterns, replacement)
      .split("\\s+")
      .map(_.trim)
      .mkString(SPACE_STR)
  }

  /** Applies document normalization without pretty formatting, keeping the first match/element
    * only
    */
  private def withFirstFormatter(
      text: String,
      action: String,
      patterns: Array[String],
      replacement: String): String = {
    action match {
      case "clean" =>
        val patternsStr: String = patterns.mkString(BREAK_STR)
        text.replaceFirst(patternsStr, replacement)
      case "extract" =>
        val htmlXml = XML.loadString(text)
        val textareaContents = htmlXml \\ patterns.mkString
        textareaContents.head.mkString
      case "lookaround" =>
        LookAroundManager.process(text, patterns, replacement)
      case _ =>
        throw new Exception(
          "Unknown action parameter in DocumentNormalizer annotation. " +
            "Please select one of: clean, extract or lookaround")
    }
  }

  /** Pretty variant of withFirstFormatter: collapses runs of whitespace into single spaces */
  private def withPrettyFirstFormatter(
      text: String,
      action: String,
      patterns: Array[String],
      replacement: String): String = {
    withFirstFormatter(text, action, patterns, replacement)
      .split("\\s+")
      .map(_.trim)
      .mkString(SPACE_STR)
  }
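
  // Worked sketch of the "all" vs "pretty_all" policies for action "clean" (inputs assumed,
  // replacement "" used for brevity):
  //   withAllFormatter("Hello <b>big</b>   world", "clean", Array("<[^>]*>"), "")
  //   // -> "Hello big   world"   (tags removed, whitespace untouched)
  //   withPrettyAllFormatter("Hello <b>big</b>   world", "clean", Array("<[^>]*>"), "")
  //   // -> "Hello big world"     (runs of whitespace collapsed to single spaces)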

  /** Apply a given encoding to the processed text
    *
    * US-ASCII Seven-bit ASCII, a.k.a. ISO646-US, a.k.a. the Basic Latin block of the Unicode
    * character set
    *
    * ISO-8859-1 ISO Latin Alphabet No. 1, a.k.a. ISO-LATIN-1
    *
    * UTF-8 Eight-bit UCS Transformation Format
    *
    * UTF-16BE Sixteen-bit UCS Transformation Format, big-endian byte order
    *
    * UTF-16LE Sixteen-bit UCS Transformation Format, little-endian byte order
    *
    * UTF-16 Sixteen-bit UCS Transformation Format, byte order identified by an optional
    * byte-order mark
    */
  private def withEncoding(text: String, encoding: Charset = StandardCharsets.UTF_8): String = {
    val defaultCharset: Charset = Charset.defaultCharset
    if (!Charset.defaultCharset.equals(encoding)) {
      log.warn("Requested encoding parameter is different from the default charset.")
    }
    new String(text.getBytes(defaultCharset), encoding)
  }

  /** Applies document normalization:
    *   1. User-defined regex patterns (if any)
    *   2. Either functional preset (CleanerHelper) OR autoMode (CleanerHelper bundles)
    *   3. Lowercasing and encoding policy
    */
  private def applyDocumentNormalization(
      text: String,
      action: String,
      patterns: Array[String],
      replacement: String,
      policy: String,
      lowercase: Boolean,
      encoding: String): String = {

    val normAutoMode = $(autoMode).toLowerCase(Locale.ROOT)
    val hasPreset = isDefined(presetPattern) && FUNCTIONAL_PRESETS.contains($(presetPattern))
    val hasAutoMode = isDefined(autoMode) && AUTO_MODE_FUNCTIONS.contains(normAutoMode)

    val selectedCleaner: Either[String => String, Seq[String => String]] =
      (hasPreset, hasAutoMode) match {
        case (true, true) =>
          logger.warn(
            s"[DocumentNormalizer] Both presetPattern (${$(presetPattern)}) and autoMode (${normAutoMode}) are set. " +
              s"autoMode will take precedence.")
          Right(AUTO_MODE_FUNCTIONS(normAutoMode))
        case (true, false) =>
          Left(FUNCTIONAL_PRESETS($(presetPattern)))
        case (false, true) =>
          Right(AUTO_MODE_FUNCTIONS(normAutoMode))
        case _ =>
          Right(Seq.empty)
      }

    val userPatterns = Option(patterns).getOrElse(Array.empty[String])

    val regexCleanedText: String = if (userPatterns.nonEmpty) {
      policy match {
        case "all" => withAllFormatter(text, action, userPatterns, replacement)
        case "pretty_all" => withPrettyAllFormatter(text, action, userPatterns, replacement)
        case "first" => withFirstFormatter(text, action, userPatterns, replacement)
        case "pretty_first" => withPrettyFirstFormatter(text, action, userPatterns, replacement)
        case _ =>
          throw new Exception(
            "Unknown policy parameter in DocumentNormalizer. " +
              "Valid options: all, pretty_all, first, pretty_first.")
      }
    } else text

    val cleanedText: String = selectedCleaner match {
      case Left(fn) =>
        logger.info(s"[DocumentNormalizer] Applying preset cleaner: ${$(presetPattern)}")
        fn(regexCleanedText)

      case Right(funcs) if funcs.nonEmpty =>
        val modeName = normAutoMode
        val functionNames = funcs.flatMap(FUNCTION_NAME_LOOKUP.get)
        logger.info(
          s"[DocumentNormalizer] AutoMode '$modeName' active. Applying cleaners in order: " +
            functionNames.mkString(", "))
        funcs.foldLeft(regexCleanedText) { (acc, cleanerFn) => cleanerFn(acc) }

      case _ =>
        regexCleanedText
    }

    val casedText = if (lowercase) cleanedText.toLowerCase else cleanedText

    encoding match {
      case "disable" => casedText
      case "UTF-8" => withEncoding(casedText, StandardCharsets.UTF_8)
      case "UTF-16" => withEncoding(casedText, StandardCharsets.UTF_16)
      case "US-ASCII" => withEncoding(casedText, StandardCharsets.US_ASCII)
      case "ISO-8859-1" => withEncoding(casedText, StandardCharsets.ISO_8859_1)
      case "UTF-16BE" => withEncoding(casedText, StandardCharsets.UTF_16BE)
      case "UTF-16LE" => withEncoding(casedText, StandardCharsets.UTF_16LE)
      case other =>
        throw new Exception(s"Unknown encoding parameter: $other. " +
          "Please select one of disable, UTF-8, UTF-16, US-ASCII, ISO-8859-1, UTF-16BE, UTF-16LE.")
    }
  }
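
  // Processing-order sketch for the method above (inputs assumed; CleanerHelper outputs are not
  // reproduced here):
  //   applyDocumentNormalization(text, "clean", Array("<[^>]*>"), " ", "pretty_all", true, "disable")
  //   // 1) regex patterns are applied under the chosen policy (here pretty_all),
  //   // 2) then the preset/autoMode cleaners, if any, run over the regex-cleaned text,
  //   // 3) then lowercasing and, unless "disable", the encoding conversion.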

  private lazy val FUNCTIONAL_PRESETS: Map[String, String => String] = Map(
    "CLEAN_BULLETS" -> CleanerHelper.cleanBullets,
    "CLEAN_ORDERED_BULLETS" -> CleanerHelper.cleanOrderedBullets,
    "CLEAN_DASHES" -> CleanerHelper.cleanDashes,
    "CLEAN_TRAILING_PUNCTUATION" -> CleanerHelper.cleanTrailingPunctuation,
    "CLEAN_EXTRA_WHITESPACE" -> CleanerHelper.cleanExtraWhitespace,
    "REMOVE_PUNCTUATION" -> CleanerHelper.removePunctuation,
    "CLEAN_NON_ASCII" -> CleanerHelper.cleanNonAsciiChars,
    "REPLACE_UNICODE" -> CleanerHelper.replaceUnicodeCharacters)

  private lazy val AUTO_MODE_FUNCTIONS: Map[String, Seq[String => String]] = Map(
    "light_clean" -> Seq(
      CleanerHelper.cleanExtraWhitespace,
      CleanerHelper.cleanTrailingPunctuation),
    "document_clean" -> Seq(
      CleanerHelper.cleanBullets,
      CleanerHelper.cleanOrderedBullets,
      CleanerHelper.cleanDashes,
      CleanerHelper.cleanExtraWhitespace),
    "social_clean" -> Seq(
      CleanerHelper.removePunctuation,
      CleanerHelper.cleanDashes,
      CleanerHelper.cleanExtraWhitespace),
    "html_clean" -> Seq(
      CleanerHelper.replaceUnicodeCharacters,
      CleanerHelper.cleanNonAsciiChars,
      CleanerHelper.decodeHtmlEntities),
    "full_auto" -> FUNCTIONAL_PRESETS.values.toSeq)

  // Reverse-map functions to preset names (for logging)
  private lazy val FUNCTION_NAME_LOOKUP: Map[String => String, String] =
    FUNCTIONAL_PRESETS.map(_.swap)

  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = annotations.map {
    annotation =>
      Try(
        applyDocumentNormalization(
          annotation.result,
          getAction,
          getPatterns,
          getReplacement,
          getPolicy,
          getLowercase,
          getEncoding)) match {
        case Success(cleanedDoc) =>
          Annotation(
            DOCUMENT,
            annotation.begin,
            cleanedDoc.length - 1,
            cleanedDoc,
            annotation.metadata)
        case Failure(_) =>
          Annotation.apply("")
      }
  }
}

/** This is the companion object of [[DocumentNormalizer]]. Please refer to that class for the
  * documentation.
  */
object DocumentNormalizer extends DefaultParamsReadable[DocumentNormalizer]

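// Worked sketch of the "lookaround" action handled by this helper (inputs assumed): the first
// pattern must contain (?=...) or (?<=...), and the first capture group of its first match per
// chunk is replaced.
//
//   LookAroundManager.process("Save 25% today", Array("(\\d+)(?=%)"), "<NUM>")
//   // -> "Save <NUM>% today"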
object LookAroundManager {

  val LOOKAHEAD_PATTERN = "(?="
  val LOOKBEHIND_PATTERN = "(?<="

  val SEMI_COLON = "\\;"
  val FULL_STOP = "\\.(?!\\d+)"
  val EXCLAMATION_MARK = "\\!"
  val QUESTION_MARK = "\\?"
  val END_FULL_STOPS_REGEX = "\\.$"
  val EMPTY_STR = ""
  val OR_STR = "|"

  // Assumes the given group (the first by default) of the matched lookaround pattern is the
  // replacement target
  def withReplacement(text: String, replacement: String, m: Regex.Match, groupIdx: Int = 1) = {
    text.replace(m.group(groupIdx), replacement)
  }

  def process(text: String, patterns: Array[String], replacement: String): String = {
    // assuming first pattern to be a lookaround containing first group as replacement target
    val lookaheadPattern: String = patterns.head
    require(
      lookaheadPattern.contains(LOOKAHEAD_PATTERN) || lookaheadPattern.contains(
        LOOKBEHIND_PATTERN),
      "First pattern with action lookaround must contain a lookaround symbol, i.e. (?=criteria) or (?<=criteria)")

    val fullStopsTrimmed = text.replaceAll(END_FULL_STOPS_REGEX, EMPTY_STR)
    val separators = Array(SEMI_COLON, FULL_STOP, EXCLAMATION_MARK, QUESTION_MARK)

    val detectedSeps =
      for (s <- separators; if text.contains(s.replace("\\", ""))) yield s.replace("\\", "")

    val chunks =
      if (!detectedSeps.isEmpty)
        fullStopsTrimmed.split(detectedSeps.mkString(OR_STR))
      else
        Array(fullStopsTrimmed)

    val lookaheadRegex: Regex = lookaheadPattern.r

    val replacedChunks = new ListBuffer[String]()

    for (c <- chunks) {
      val res = lookaheadRegex.findFirstMatchIn(c) match {
        case Some(m) => withReplacement(c, replacement, m)
        case _ => c
      }
      replacedChunks += res
    }

    if (detectedSeps.length > 0)
      replacedChunks.mkString(detectedSeps.head)
    else
      replacedChunks.mkString
  }
}