JohnSnowLabs / spark-nlp / 13883000244
16 Mar 2025 11:44AM CUT coverage: 59.034% (-1.0%) from 60.072%
Pull Request #14444: Sparknlp 1060 implement phi 3.5 vision (merge 6d717703b into 05000ab4a)

0 of 292 new or added lines in 5 files covered (0.0%).
20 existing lines in 14 files now uncovered.
9413 of 15945 relevant lines covered (59.03%), 0.59 hits per line.

Source file (95.11% covered): /src/main/scala/com/johnsnowlabs/nlp/annotators/er/EntityRulerApproach.scala

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators.er

import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT, TOKEN}
import com.johnsnowlabs.nlp.annotators.param.ExternalResourceParam
import com.johnsnowlabs.nlp.util.io.ResourceHelper.spark
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import com.johnsnowlabs.storage.Database.Name
import com.johnsnowlabs.storage._
import com.johnsnowlabs.util.JsonParser
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions.{col, collect_set, concat, flatten, lit}
import org.apache.spark.sql.types.{BooleanType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.io.Source

/** Fits an Annotator to match exact strings or regex patterns provided in a file against a
  * Document and assigns them a named entity. The definitions can contain any number of named
  * entities.
  *
  * There are multiple ways and formats to set the extraction resource. It is possible to set it
  * either as a "JSON", "JSONL" or "CSV" file. A path to the file needs to be provided to
  * `setPatternsResource`. The file format needs to be set as the "format" field in the `option`
  * parameter map and depending on the file type, additional parameters might need to be set.
  *
  * If the file is in a JSON format, then the rule definitions need to be given in a list with the
  * fields "id", "label" and "patterns":
  * {{{
  *  [
  *   {
  *     "id": "person-regex",
  *     "label": "PERSON",
  *     "patterns": ["\\w+\\s\\w+", "\\w+-\\w+"]
  *   },
  *   {
  *     "id": "locations-words",
  *     "label": "LOCATION",
  *     "patterns": ["Winterfell"]
  *   }
  * ]
  * }}}
  *
  * The same fields also apply to a file in the JSONL format:
  * {{{
  * {"id": "names-with-j", "label": "PERSON", "patterns": ["Jon", "John", "John Snow"]}
  * {"id": "names-with-s", "label": "PERSON", "patterns": ["Stark", "Snow"]}
  * {"id": "names-with-e", "label": "PERSON", "patterns": ["Eddard", "Eddard Stark"]}
  * }}}
  *
  * In order to use a CSV file, an additional parameter "delimiter" needs to be set. In this case,
  * the delimiter might be set by using `.setPatternsResource("patterns.csv", ReadAs.TEXT,
  * Map("format"->"csv", "delimiter" -> "\\|"))`
  * {{{
  * PERSON|Jon
  * PERSON|John
  * PERSON|John Snow
  * LOCATION|Winterfell
  * }}}
  *
  * ==Example==
  * In this example, the entities file has the form of
  * {{{
  * PERSON|Jon
  * PERSON|John
  * PERSON|John Snow
  * LOCATION|Winterfell
  * }}}
  * where each line represents an entity and the associated string delimited by "|".
  *
  * {{{
  * import spark.implicits._
  * import com.johnsnowlabs.nlp.base.DocumentAssembler
  * import com.johnsnowlabs.nlp.annotators.Tokenizer
  * import com.johnsnowlabs.nlp.annotators.er.EntityRulerApproach
  * import com.johnsnowlabs.nlp.util.io.ReadAs
  *
  * import org.apache.spark.ml.Pipeline
  *
  * val documentAssembler = new DocumentAssembler()
  *   .setInputCol("text")
  *   .setOutputCol("document")
  *
  * val tokenizer = new Tokenizer()
  *   .setInputCols("document")
  *   .setOutputCol("token")
  *
  * val entityRuler = new EntityRulerApproach()
  *   .setInputCols("document", "token")
  *   .setOutputCol("entities")
  *   .setPatternsResource(
  *     path = "src/test/resources/entity-ruler/patterns.csv",
  *     readAs = ReadAs.TEXT,
  *     options = Map("format" -> "csv", "delimiter" -> "\\|")
  *   )
  *
  * val pipeline = new Pipeline().setStages(Array(
  *   documentAssembler,
  *   tokenizer,
  *   entityRuler
  * ))
  *
  * val data = Seq("Jon Snow wants to be lord of Winterfell.").toDF("text")
  * val result = pipeline.fit(data).transform(data)
  *
  * result.selectExpr("explode(entities)").show(false)
  * +--------------------------------------------------------------------+
  * |col                                                                 |
  * +--------------------------------------------------------------------+
  * |[chunk, 0, 2, Jon, [entity -> PERSON, sentence -> 0], []]           |
  * |[chunk, 29, 38, Winterfell, [entity -> LOCATION, sentence -> 0], []]|
  * +--------------------------------------------------------------------+
  * }}}
  *
  * @param uid
  *   required uid for storing annotator to disk
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupprio param  1
  * @groupprio anno  2
  * @groupprio Ungrouped 3
  * @groupprio setParam  4
  * @groupprio getParam  5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
  */
class EntityRulerApproach(override val uid: String)
    extends AnnotatorApproach[EntityRulerModel]
    with HasStorage {

  def this() = this(Identifiable.randomUID("ENTITY_RULER"))

  override val description: String = "Entity Ruler matches entities based on text patterns"

  private var entitiesForRegex: Array[String] = Array()
  private val keywordsPatterns: ArrayBuffer[EntityPattern] = ArrayBuffer()
  private var regexPatterns: Map[String, Seq[String]] = Map()

  /** Resource in JSON, JSONL or CSV format to map entities to patterns (Default: `null`).
    *
    * @group param
    */
  val patternsResource: ExternalResourceParam = new ExternalResourceParam(
    this,
    "patternsResource",
    "Resource in JSON or CSV format to map entities to patterns")

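  /** Whether to find matches at sentence level rather than token level, applied to regex
    * patterns only (Default: `false`).
    *
    * @group param
    */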
  val sentenceMatch = new BooleanParam(
    this,
    "sentenceMatch",
    "Whether to find match at sentence level (regex only). True: sentence level. False: token level")

  /** Whether to use RocksDB storage to serialize patterns (Default: `false`).
    *
    * @group param
    */
  val useStorage =
    new BooleanParam(this, "useStorage", "Whether to use RocksDB storage to serialize patterns")

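  /** Path to a plain text file with all characters in a given alphabet, used to build the
    * Aho-Corasick automaton for keyword patterns (Default: `"english"`).
    *
    * @group param
    */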
  val alphabet = new ExternalResourceParam(
    this,
    "alphabet",
    "Alphabet resource path to plain text file with all characters in a given alphabet")

  /** @group setParam */
  def setPatternsResource(
      path: String,
      readAs: ReadAs.Format,
      options: Map[String, String] = Map("format" -> "JSON")): this.type =
    set(patternsResource, ExternalResource(path, readAs, options))

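  /** @group setParam */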
  def setSentenceMatch(value: Boolean): this.type = set(sentenceMatch, value)

  /** @group setParam */
  def setUseStorage(value: Boolean): this.type = set(useStorage, value)

  /** @group setParam */
  def setAlphabetResource(path: String): this.type = {
    set(alphabet, ExternalResource(path, ReadAs.TEXT, Map()))
  }

  setDefault(
    storagePath -> ExternalResource("", ReadAs.TEXT, Map()),
    patternsResource -> null,
    useStorage -> false,
    sentenceMatch -> false,
    caseSensitive -> true,
    alphabet -> ExternalResource("english", ReadAs.TEXT, Map()))

  private val AVAILABLE_FORMATS = Array("JSON", "JSONL", "CSV")

  override def beforeTraining(spark: SparkSession): Unit = {
    validateParameters()
  }

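  // Builds the EntityRulerModel: when useStorage is enabled the patterns were already written to
  // RocksDB during indexing, so only the storage reference is passed on; otherwise the patterns
  // are parsed here and embedded in the model. Keyword (non-regex) patterns are compiled into an
  // Aho-Corasick automaton.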
  override def train(
      dataset: Dataset[_],
      recursivePipeline: Option[PipelineModel]): EntityRulerModel = {

    val entityRuler = new EntityRulerModel()

    if ($(useStorage)) {
      entityRuler
        .setStorageRef($(storageRef))
        .setUseStorage($(useStorage))

    } else {
      storePatterns(None)
      val entityRulerFeatures = EntityRulerFeatures(Map(), regexPatterns)
      entityRuler
        .setUseStorage($(useStorage))
        .setEntityRulerFeatures(entityRulerFeatures)
    }

    var automaton: Option[AhoCorasickAutomaton] = None
    if (keywordsPatterns.nonEmpty) {
      val alphabet = EntityRulerUtil.loadAlphabet($(this.alphabet).path)
      automaton = Some(
        new AhoCorasickAutomaton(alphabet, keywordsPatterns.toArray, $(caseSensitive)))
    }

    entityRuler
      .setRegexEntities(entitiesForRegex)
      .setAhoCorasickAutomaton(automaton)

  }

  protected def index(
      fitDataset: Dataset[_],
      storageSourcePath: Option[String],
      readAs: Option[ReadAs.Value],
      writers: Map[Name, StorageWriter[_]],
      readOptions: Option[Map[String, String]]): Unit = {

    validateParameters()

    if ($(useStorage)) {
      val storageWriter =
        writers(Database.ENTITY_REGEX_PATTERNS).asInstanceOf[RegexPatternsReadWriter]
      storePatterns(Some(storageWriter))
    }

  }

  private def storePatterns(storageWriter: Option[RegexPatternsReadWriter]): Unit = {

    resourceFormats match {
      case "JSON&TEXT" => storePatternsFromJson(storageWriter)
      case "JSONL&TEXT" => storePatternsFromJsonl(storageWriter)
      case "JSON&SPARK" => storePatternsFromJSONDataFrame(storageWriter, "JSON")
      case "JSONL&SPARK" => storePatternsFromJSONDataFrame(storageWriter, "JSONL")
      case "CSV&TEXT" => storePatternsFromCSV(storageWriter)
      case "CSV&SPARK" => storeEntityPatternsFromCSVDataFrame(storageWriter)
      case _ @format => throw new IllegalArgumentException(s"format $format not available")
    }
  }

  private def validateParameters(): Unit = {
    require($(patternsResource) != null, "patternsResource parameter required")
    require($(patternsResource).path != "", "path for a patternsResource file is required")
    require(
      AVAILABLE_FORMATS.contains(
        $(patternsResource).options.getOrElse("format", "").toUpperCase()),
      "format option parameter required with either JSON or CSV values")
    if ($(patternsResource).options("format").toUpperCase() == "CSV") {
      require(
        $(patternsResource).options.getOrElse("delimiter", "") != "",
        "delimiter option parameter required")
    }
    require($(patternsResource).readAs != null, "readAs parameter required")
  }

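  // Dispatch key combining the "format" option with the readAs mode, e.g. "CSV&TEXT" or
  // "JSON&SPARK"; storePatterns matches on this key to pick the right parser.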
  private lazy val resourceFormats: String = $(patternsResource)
    .options("format")
    .toUpperCase() + "&" + $(patternsResource).readAs

  private def storePatternsFromJson(storageReadWriter: Option[RegexPatternsReadWriter]): Unit = {

    val entityPatterns: Array[EntityPattern] = parseJSON()

    entityPatterns.foreach { entityPattern =>
      if (entityPattern.regex.getOrElse(false)) {
        storeEntityPattern(entityPattern, storageReadWriter)
      } else {
        keywordsPatterns.append(entityPattern)
      }
    }
  }

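  // Stores a single pattern definition. Entities with an id are keyed as "label,id", otherwise
  // just as "label". With a storage writer the patterns go to RocksDB; with no writer they are
  // kept in memory via computePatterns.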
  private def storeEntityPattern(
      entityPattern: EntityPattern,
      storageReadWriter: Option[RegexPatternsReadWriter]): Unit = {
    val entity =
      if (entityPattern.id.isDefined) s"${entityPattern.label},${entityPattern.id.get}"
      else entityPattern.label
    storageReadWriter.getOrElse(None) match {
      case patternsWriter: PatternsReadWriter =>
        storePatterns(entityPattern.patterns.toIterator, entity, patternsWriter)
      case regexPatternsWriter: RegexPatternsReadWriter =>
        storeRegexPattern(entityPattern.patterns, entity, regexPatternsWriter)
      case None => {
        val isRegex = entityPattern.regex.getOrElse(false)
        computePatterns(entityPattern.patterns, isRegex, entity)
      }
    }
  }

  private def parseJSON(): Array[EntityPattern] = {
    val stream = ResourceHelper.getResourceStream($(patternsResource).path)
    val jsonContent = Source.fromInputStream(stream).mkString
    val entityPatterns: Array[EntityPattern] = JsonParser.parseArray[EntityPattern](jsonContent)

    entityPatterns
  }

  private def storePatternsFromJsonl(storageReadWriter: Option[RegexPatternsReadWriter]): Unit = {

    val sourceStream = ResourceHelper.SourceStream($(patternsResource).path)

    sourceStream.content.foreach(content =>
      content.foreach { line =>
        val entityPattern: EntityPattern = JsonParser.parseObject[EntityPattern](line)
        if (entityPattern.regex.getOrElse(false)) {
          storeEntityPattern(entityPattern, storageReadWriter)
        } else keywordsPatterns.append(entityPattern)
      })
  }

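  // Parses delimited lines of the form label<delimiter>pattern[<delimiter>isRegex]: regex
  // patterns are grouped per label and optionally written to storage, keyword patterns are
  // collected for the Aho-Corasick automaton.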
  private def storePatternsFromCSV(regexPatternsWriter: Option[RegexPatternsReadWriter]): Unit = {

    val delimiter: String = $(patternsResource).options("delimiter")
    val patternsLines = ResourceHelper.parseLines($(patternsResource))
    val regexList: ArrayBuffer[String] = ArrayBuffer()
    val keywords: mutable.Map[String, Seq[String]] = mutable.Map()
    val regexPatterns: mutable.Map[String, Seq[String]] = mutable.Map()
    var patternsHasRegex = false

    val groupByLabel =
      patternsLines.groupBy(pattern => EntityRulerUtil.splitString(pattern, delimiter)(0))
    groupByLabel.foreach { case (label, lines) =>
      lines.foreach { line =>
        val columns: Array[String] = EntityRulerUtil.splitString(line, delimiter)
        val pattern = columns(1)
        val isRegex = if (columns.length == 2) false else EntityRulerUtil.toBoolean(columns(2))

        if (isRegex) {
          regexList.append(pattern)
          patternsHasRegex = true
        } else {
          val patterns = keywords.getOrElse(label, Seq())
          keywords(label) = patterns ++ Seq(pattern)
        }
      }

      if (regexPatternsWriter.isEmpty) {
        regexPatterns(label) = regexList
      }

      if (patternsHasRegex && regexPatternsWriter.nonEmpty) {
        storeRegexPattern(regexList, label, regexPatternsWriter.get)
      }

      keywords.foreach { case (label, patterns) =>
        keywordsPatterns.append(EntityPattern(label, patterns))
      }
      keywords.clear()
    }

    if (regexPatternsWriter.isEmpty) {
      this.regexPatterns = regexPatterns.toMap
      if (patternsHasRegex) entitiesForRegex = regexPatterns.keys.toArray
    }

  }

  private def storeEntityPatternsFromCSVDataFrame(
      storageReadWriter: Option[RegexPatternsReadWriter]): Unit = {

    val patternOptions = $(patternsResource).options
    val patternsSchema = StructType(
      Array(
        StructField("label", StringType, nullable = false),
        StructField("pattern", StringType, nullable = false),
        StructField("regex", BooleanType, nullable = true)))

    val patternsDataFrame = spark.read
      .format(patternOptions("format"))
      .options(patternOptions)
      .option("delimiter", patternOptions("delimiter"))
      .schema(patternsSchema)
      .load($(patternsResource).path)
      .na
      .fill(value = false, Array("regex"))

    val groupedByPatternsDataFrame = patternsDataFrame
      .groupBy("label", "regex")
      .agg(collect_set("pattern").alias("patterns"))

    storeFromDataFrame(
      groupedByPatternsDataFrame,
      idFieldExist = false,
      regexFieldExist = true,
      storageReadWriter)

  }

  private def storePatternsFromJSONDataFrame(
      storageReadWriter: Option[RegexPatternsReadWriter],
      format: String): Unit = {

    val path = $(patternsResource).path

    val dataFrameReader = spark.read
    if (format.equals("JSON")) {
      dataFrameReader.option("multiline", "true")
    }

    var patternsDataFrame = dataFrameReader
      .json(path)

    val idField: Array[StructField] =
      patternsDataFrame.schema.fields.filter(field => field.name == "id")
    val regexField: Array[StructField] =
      patternsDataFrame.schema.fields.filter(field => field.name == "regex")

    if (regexField.isEmpty) {
      patternsDataFrame = patternsDataFrame.withColumn("regex", lit(false))
    } else {
      patternsDataFrame = patternsDataFrame.na.fill(value = false, Array("regex"))
    }
    if (idField.nonEmpty) patternsDataFrame.na.drop()

    storeFromDataFrame(
      patternsDataFrame,
      idField.nonEmpty,
      regexField.nonEmpty,
      storageReadWriter)
  }

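  // Splits the grouped patterns DataFrame into regex and keyword rows: regex patterns are written
  // to storage or kept in memory via computePatterns, keyword patterns are appended to
  // keywordsPatterns for the Aho-Corasick automaton.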
  private def storeFromDataFrame(
      patternsDataFrame: DataFrame,
      idFieldExist: Boolean,
      regexFieldExist: Boolean,
      storageReadWriter: Option[RegexPatternsReadWriter]): Unit = {

    val regexPatternsDataFrame = patternsDataFrame.filter(col("regex") === true)
    val cleanedRegexPatternsDataFrame =
      cleanPatternsDataFrame(regexPatternsDataFrame, idFieldExist)

    cleanedRegexPatternsDataFrame.rdd.toLocalIterator.foreach { row =>
      val patterns = row.getAs[Seq[String]]("flatten_patterns")
      val entity =
        if (idFieldExist) row.getAs[String]("label_id") else row.getAs[String]("label")
      storageReadWriter.getOrElse(None) match {
        case patternsWriter: PatternsReadWriter =>
          storePatterns(patterns.toIterator, entity, patternsWriter)
        case regexPatternsWriter: RegexPatternsReadWriter =>
          storeRegexPattern(patterns, entity, regexPatternsWriter)
        case None => computePatterns(patterns, isRegex = true, entity)
      }
    }

    val keywordsDataFrame = patternsDataFrame.filter(col("regex") === false)
    val cleanedKeywordsDataFrame = cleanPatternsDataFrame(keywordsDataFrame, idFieldExist)

    cleanedKeywordsDataFrame.rdd.toLocalIterator.foreach { row =>
      val patterns = row.getAs[Seq[String]]("flatten_patterns")
      if (idFieldExist) {
        val labelId = row.getAs[String]("label_id")
        val label = labelId.split(",")(0)
        val id = labelId.split(",")(1)
        keywordsPatterns.append(EntityPattern(label, patterns, Some(id), Some(true)))
      } else {
        val label = row.getAs[String]("label")
        keywordsPatterns.append(EntityPattern(label, patterns, None, Some(true)))
      }

    }

  }

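  // Groups the patterns by "label_id" (label and id, comma-separated) when an id column exists,
  // otherwise by "label", flattening the collected pattern arrays into a single
  // "flatten_patterns" column.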
  private def cleanPatternsDataFrame(
      patternsDataFrame: DataFrame,
      idFieldExist: Boolean): DataFrame = {

    if (idFieldExist) {
      val patternsWithIdDataFrame =
        patternsDataFrame.withColumn("label_id", concat(col("label"), lit(","), col("id")))

      patternsWithIdDataFrame
        .groupBy("label_id")
        .agg(flatten(collect_set("patterns")).as("flatten_patterns"))
    } else {
      patternsDataFrame
        .groupBy("label")
        .agg(flatten(collect_set("patterns")).as("flatten_patterns"))
    }
  }

  private def storePatterns(
      patterns: Iterator[String],
      entity: String,
      patternsReaderWriter: PatternsReadWriter): Unit = {
    patterns.foreach(pattern => storePattern(pattern, entity, patternsReaderWriter))
  }

  private def storePattern(
      pattern: String,
      entity: String,
      patternsReaderWriter: PatternsReadWriter): Unit = {
    patternsReaderWriter.lookup(pattern).getOrElse(patternsReaderWriter.add(pattern, entity))
  }

  private def storeRegexPattern(
      pattern: Seq[String],
      entity: String,
      regexPatternsReaderWriter: RegexPatternsReadWriter): Unit = {

    if (!entitiesForRegex.contains(entity)) {
      entitiesForRegex = entitiesForRegex ++ Array(entity)
    }
    regexPatternsReaderWriter
      .lookup(entity)
      .getOrElse(regexPatternsReaderWriter.add(entity, pattern))
  }

  protected def createWriter(database: Name, connection: RocksDBConnection): StorageWriter[_] = {
    new RegexPatternsReadWriter(connection)
  }

  override def indexStorage(fitDataset: Dataset[_], resource: Option[ExternalResource]): Unit = {
    if ($(useStorage)) {
      super.indexStorage(fitDataset, resource)
    }
  }

  private def computePatterns(patterns: Seq[String], isRegex: Boolean, entity: String): Unit = {
    if (isRegex) {
      regexPatterns = regexPatterns ++ Map(entity -> patterns)
      if (!entitiesForRegex.contains(entity)) {
        entitiesForRegex = entitiesForRegex ++ Array(entity)
      }
    }
  }

  /** Input annotator types: DOCUMENT, TOKEN (optional)
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[String] = Array(DOCUMENT)
  override val optionalInputAnnotatorTypes: Array[String] = Array(TOKEN)

  /** Output annotator types: CHUNK
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = CHUNK

  override protected val databases: Array[Name] = EntityRulerModel.databases

}