• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 4651025675

pending completion
4651025675

Pull #13742

github

GitHub
Merge d6d36ba2f into 7cb29641c
Pull Request #13742: Release/440 release candidate

275 of 275 new or added lines in 21 files covered. (100.0%)

8638 of 13101 relevant lines covered (65.93%)

0.66 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/NerConverter.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators.ner
18

19
import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT, NAMED_ENTITY, TOKEN}
20
import com.johnsnowlabs.nlp.annotators.common.NerTagged
21
import com.johnsnowlabs.nlp._
22
import org.apache.spark.ml.param.{BooleanParam, StringArrayParam}
23
import org.apache.spark.ml.util.Identifiable
24

25
import scala.collection.immutable.Map
26

27
/** Converts an IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens
28
  * of recognized entities and their label. Results in `CHUNK` Annotation type.
29
  *
30
  * NER chunks can then be filtered by setting a whitelist with `setWhiteList`. Chunks with no
31
  * associated entity (tagged "O") are filtered.
32
  *
33
  * See also
34
  * [[https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging) Inside–outside–beginning (tagging)]]
35
  * for more information.
36
  *
37
  * ==Example==
38
  * This is a continuation of the example of the
39
  * [[com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel NerDLModel]]. See that class on how to
40
  * extract the entities.
41
  *
42
  * The output of the NerDLModel follows the Annotator schema and can be converted like so:
43
  * {{{
44
  * result.selectExpr("explode(ner)").show(false)
45
  * +----------------------------------------------------+
46
  * |col                                                 |
47
  * +----------------------------------------------------+
48
  * |[named_entity, 0, 2, B-ORG, [word -> U.N], []]      |
49
  * |[named_entity, 3, 3, O, [word -> .], []]            |
50
  * |[named_entity, 5, 12, O, [word -> official], []]    |
51
  * |[named_entity, 14, 18, B-PER, [word -> Ekeus], []]  |
52
  * |[named_entity, 20, 24, O, [word -> heads], []]      |
53
  * |[named_entity, 26, 28, O, [word -> for], []]        |
54
  * |[named_entity, 30, 36, B-LOC, [word -> Baghdad], []]|
55
  * |[named_entity, 37, 37, O, [word -> .], []]          |
56
  * +----------------------------------------------------+
57
  * }}}
58
  * After the converter is used:
59
  * {{{
60
  * val converter = new NerConverter()
61
  *   .setInputCols("sentence", "token", "ner")
62
  *   .setOutputCol("entities")
63
  *   .setPreservePosition(false)
64
  *
65
  * converter.transform(result).selectExpr("explode(entities)").show(false)
66
  * +------------------------------------------------------------------------+
67
  * |col                                                                     |
68
  * +------------------------------------------------------------------------+
69
  * |[chunk, 0, 2, U.N, [entity -> ORG, sentence -> 0, chunk -> 0], []]      |
70
  * |[chunk, 14, 18, Ekeus, [entity -> PER, sentence -> 0, chunk -> 1], []]  |
71
  * |[chunk, 30, 36, Baghdad, [entity -> LOC, sentence -> 0, chunk -> 2], []]|
72
  * +------------------------------------------------------------------------+
73
  * }}}
74
  *
75
  * @groupname anno Annotator types
76
  * @groupdesc anno
77
  *   Required input and expected output annotator types
78
  * @groupname Ungrouped Members
79
  * @groupname param Parameters
80
  * @groupname setParam Parameter setters
81
  * @groupname getParam Parameter getters
82
  * @groupname Ungrouped Members
83
  * @groupprio param  1
84
  * @groupprio anno  2
85
  * @groupprio Ungrouped 3
86
  * @groupprio setParam  4
87
  * @groupprio getParam  5
88
  * @groupdesc param
89
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
90
  *   parameter values through setters and getters, respectively.
91
  */
92
class NerConverter(override val uid: String)
    extends AnnotatorModel[NerConverter]
    with HasSimpleAnnotate[NerConverter] {

  /** Creates an instance with a random uid generated from the `NER_CONVERTER` prefix. */
  def this() = this(Identifiable.randomUID("NER_CONVERTER"))

  /** Input Annotator Type : DOCUMENT, TOKEN, NAMED_ENTITY
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[String] = Array(DOCUMENT, TOKEN, NAMED_ENTITY)

  /** Output Annotator Type : CHUNK
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = CHUNK

  /** If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix
    * on labels
    *
    * @group param
    */
  val whiteList: StringArrayParam = new StringArrayParam(
    this,
    "whiteList",
    "If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix on labels")

  /** If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix
    * on labels
    *
    * @param list
    *   entity labels to keep
    * @group setParam
    */
  def setWhiteList(list: String*): this.type = set(whiteList, list.toArray)

  /** Whether to preserve the original position of the tokens in the original document or use the
    * modified tokens (Default: `true`)
    *
    * @group param
    */
  val preservePosition: BooleanParam = new BooleanParam(
    this,
    "preservePosition",
    "Whether to preserve the original position of the tokens in the original document or use the modified tokens")

  /** Whether to preserve the original position of the tokens in the original document or use the
    * modified tokens (Default: `true`)
    *
    * @group setParam
    */
  def setPreservePosition(value: Boolean): this.type = set(preservePosition, value)

  /** Set this to true if your NER tags come from a model that does not have an IOB/IOB2 schema
    * (Default: `false`)
    *
    * @group param
    */
  val nerHasNoSchema: BooleanParam = new BooleanParam(
    this,
    "nerHasNoSchema",
    "set this to true if your NER tags coming from a model that does not have a IOB/IOB2 schema")

  /** Set this to true if your NER tags come from a model that does not have an IOB/IOB2 schema
    * (Default: `false`)
    *
    * @group setParam
    */
  def setNerHasNoSchema(value: Boolean): this.type = set(nerHasNoSchema, value)

  setDefault(preservePosition -> true, nerHasNoSchema -> false)

  /** Builds `CHUNK` annotations from the tagged sentences found in `annotations`.
    *
    * Each resulting annotation carries the metadata keys `entity`, `sentence` and `chunk` (the
    * index of the chunk among the kept entities), plus `confidence` when the entity has one.
    * When `whiteList` is set, only entities whose label is in the list are kept.
    *
    * @param annotations
    *   the annotations of one row; the DOCUMENT annotations and whatever `NerTagged.unpack`
    *   extracts from them are used
    * @return
    *   one annotation of type `outputAnnotatorType` per kept entity
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    val sentences = NerTagged.unpack(annotations)

    // Keep only the DOCUMENT annotations whose span contains at least one tagged word.
    val docs = annotations.filter(a =>
      a.annotatorType == AnnotatorType.DOCUMENT && sentences.exists(b =>
        b.indexedTaggedWords.exists(c => c.begin >= a.begin && c.end <= a.end)))

    // NOTE(review): sentences and documents are paired purely by position; this assumes both
    // sequences line up one-to-one — confirm against NerTagged.unpack.
    val entities = sentences.zip(docs.zipWithIndex).flatMap {
      case (sentence, (doc, sentenceIndex)) =>
        NerTagsEncoding.fromIOB(
          sentence,
          doc,
          sentenceIndex = sentenceIndex,
          originalOffset = $(preservePosition),
          nerHasNoSchema = $(nerHasNoSchema))
    }

    // Resolve the optional whitelist once instead of once per entity. None means "keep all".
    val allowedEntities: Option[Set[String]] = get(whiteList).map(_.toSet)

    entities
      .filter(entity => allowedEntities.forall(_.contains(entity.entity)))
      .zipWithIndex
      .map { case (entity, idx) =>
        val baseMetadata =
          Map("entity" -> entity.entity, "sentence" -> entity.sentenceId, "chunk" -> idx.toString)
        // Add the confidence entry only when the entity has one.
        val metadata = entity.confidence.fold(baseMetadata) { confidence =>
          baseMetadata + ("confidence" -> confidence.toString)
        }
        Annotation(outputAnnotatorType, entity.start, entity.end, entity.text, metadata)
      }
  }

}
188

189
/** This is the companion object of [[NerConverter]]. Please refer to that class for the
  * documentation.
  */
object NerConverter extends ParamsAndFeaturesReadable[NerConverter]
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc