4651025675

Build Type

github

Committed by GitHub

Commit Message

Merge d6d36ba2f into 7cb29641c

Pull Request #13742: Release/440 release candidate

Run Details

275 of 275 new or added lines in 21 files covered. (100.0%)

8638 of 13101 relevant lines covered (65.93%)

0.66 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/NerConverter.scala

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators.ner

import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT, NAMED_ENTITY, TOKEN}
import com.johnsnowlabs.nlp.annotators.common.NerTagged
import com.johnsnowlabs.nlp._
import org.apache.spark.ml.param.{BooleanParam, StringArrayParam}
import org.apache.spark.ml.util.Identifiable

import scala.collection.immutable.Map

/** Converts an IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens
  * of recognized entities and their label. Results in `CHUNK` Annotation type.
  *
  * NER chunks can then be filtered by setting a whitelist with `setWhiteList`. Chunks with no
  * associated entity (tagged "O") are filtered.
  *
  * See also
  * [[https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging) Inside–outside–beginning (tagging)]]
  * for more information.
  *
  * ==Example==
  * This is a continuation of the example of the
  * [[com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel NerDLModel]]. See that class on how to
  * extract the entities.
  *
  * The output of the NerDLModel follows the Annotator schema and can be converted like so:
  * {{{
  * result.selectExpr("explode(ner)").show(false)
  * +----------------------------------------------------+
  * |col                                                 |
  * +----------------------------------------------------+
  * |[named_entity, 0, 2, B-ORG, [word -> U.N], []]      |
  * |[named_entity, 3, 3, O, [word -> .], []]            |
  * |[named_entity, 5, 12, O, [word -> official], []]    |
  * |[named_entity, 14, 18, B-PER, [word -> Ekeus], []]  |
  * |[named_entity, 20, 24, O, [word -> heads], []]      |
  * |[named_entity, 26, 28, O, [word -> for], []]        |
  * |[named_entity, 30, 36, B-LOC, [word -> Baghdad], []]|
  * |[named_entity, 37, 37, O, [word -> .], []]          |
  * +----------------------------------------------------+
  * }}}
  * After the converter is used:
  * {{{
  * val converter = new NerConverter()
  *   .setInputCols("sentence", "token", "ner")
  *   .setOutputCol("entities")
  *   .setPreservePosition(false)
  *
  * converter.transform(result).selectExpr("explode(entities)").show(false)
  * +------------------------------------------------------------------------+
  * |col                                                                     |
  * +------------------------------------------------------------------------+
  * |[chunk, 0, 2, U.N, [entity -> ORG, sentence -> 0, chunk -> 0], []]      |
  * |[chunk, 14, 18, Ekeus, [entity -> PER, sentence -> 0, chunk -> 1], []]  |
  * |[chunk, 30, 36, Baghdad, [entity -> LOC, sentence -> 0, chunk -> 2], []]|
  * +------------------------------------------------------------------------+
  * }}}
  *
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupname Ungrouped Members
  * @groupprio param  1
  * @groupprio anno  2
  * @groupprio Ungrouped 3
  * @groupprio setParam  4
  * @groupprio getParam  5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
  */
class NerConverter(override val uid: String)
    extends AnnotatorModel[NerConverter]
    with HasSimpleAnnotate[NerConverter] {

  /** Creates an instance with a freshly generated uid prefixed by `NER_CONVERTER`. */
  def this() = this(Identifiable.randomUID("NER_CONVERTER"))

  /** Annotator types expected as input: DOCUMENT, TOKEN and NAMED_ENTITY.
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[String] = Array(DOCUMENT, TOKEN, NAMED_ENTITY)

  /** Annotator type produced as output: CHUNK.
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = CHUNK

  /** Optional list of entity labels to keep; entities with any other label are dropped. Labels
    * must be given without their IOB prefix. When the parameter is not set, nothing is filtered.
    *
    * @group param
    */
  val whiteList: StringArrayParam =
    new StringArrayParam(
      this,
      "whiteList",
      "If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix on labels")

  /** Sets the entity labels to keep (without IOB prefix); all other entities are ignored.
    *
    * @group setParam
    */
  def setWhiteList(list: String*): NerConverter.this.type = set(whiteList, list.toArray)

  /** Whether chunk positions refer to the original document or to the modified tokens (Default:
    * `true`).
    *
    * @group param
    */
  val preservePosition: BooleanParam =
    new BooleanParam(
      this,
      "preservePosition",
      "Whether to preserve the original position of the tokens in the original document or use the modified tokens")

  /** Sets whether to preserve the original token positions of the document (Default: `true`).
    *
    * @group setParam
    */
  def setPreservePosition(value: Boolean): this.type = set(preservePosition, value)

  /** Enable when the incoming NER tags come from a model that does not follow an IOB/IOB2 schema
    * (Default: `false`).
    *
    * @group param
    */
  val nerHasNoSchema: BooleanParam =
    new BooleanParam(
      this,
      "nerHasNoSchema",
      "set this to true if your NER tags coming from a model that does not have a IOB/IOB2 schema")

  /** Sets whether the incoming NER tags lack an IOB/IOB2 schema.
    *
    * @group setParam
    */
  def setNerHasNoSchema(value: Boolean): this.type = set(nerHasNoSchema, value)

  // Defaults are registered only after every Param above has been initialised.
  setDefault(preservePosition -> true, nerHasNoSchema -> false)

  /** Turns the NER tags found in `annotations` into CHUNK annotations.
    *
    * The tagged sentences are unpacked with `NerTagged.unpack` and paired, by position, with the
    * DOCUMENT annotations that fully contain at least one tagged word. Each pair is decoded with
    * `NerTagsEncoding.fromIOB`; the resulting entities are filtered by the whitelist (when set)
    * and numbered in order of appearance.
    *
    * @param annotations
    *   the input annotations of one row
    * @return
    *   one CHUNK annotation per retained entity, with `entity`, `sentence` and `chunk` metadata,
    *   plus `confidence` when the entity carries one
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    val taggedSentences = NerTagged.unpack(annotations)

    // True when the span of `document` fully contains at least one tagged word of any sentence.
    def coversSomeTaggedWord(document: Annotation): Boolean =
      taggedSentences.exists { tagged =>
        tagged.indexedTaggedWords.exists { word =>
          word.begin >= document.begin && word.end <= document.end
        }
      }

    val matchingDocuments = annotations.filter { candidate =>
      candidate.annotatorType == AnnotatorType.DOCUMENT && coversSomeTaggedWord(candidate)
    }

    // Sentences and documents are paired positionally; the document's index is the sentence index.
    val decodedEntities = taggedSentences.zip(matchingDocuments.zipWithIndex).flatMap {
      case (tagged, (document, documentIdx)) =>
        NerTagsEncoding.fromIOB(
          tagged,
          document,
          sentenceIndex = documentIdx,
          originalOffset = $(preservePosition),
          nerHasNoSchema = $(nerHasNoSchema))
    }

    // An unset whitelist (None) accepts every entity.
    val allowedLabels = get(whiteList)
    val retained =
      decodedEntities.filter(candidate => allowedLabels.forall(_.contains(candidate.entity)))

    retained.zipWithIndex.map { case (namedEntity, chunkIdx) =>
      val requiredMetadata = Map(
        "entity" -> namedEntity.entity,
        "sentence" -> namedEntity.sentenceId,
        "chunk" -> chunkIdx.toString)

      val metadata = namedEntity.confidence match {
        case Some(score) => requiredMetadata + ("confidence" -> score.toString)
        case None => requiredMetadata
      }

      Annotation(
        outputAnnotatorType,
        namedEntity.start,
        namedEntity.end,
        namedEntity.text,
        metadata)
    }
  }

}

/** Companion object of [[NerConverter]].
  *
  * It adds no members of its own; everything it offers comes from
  * `ParamsAndFeaturesReadable[NerConverter]` (presumably the read/load support for persisted
  * instances; confirm against that trait).
  */
object NerConverter extends ParamsAndFeaturesReadable[NerConverter]

1	/*
2	* Copyright 2017-2022 John Snow Labs
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*/
16
17	package com.johnsnowlabs.nlp.annotators.ner
18
19	import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT, NAMED_ENTITY, TOKEN}
20	import com.johnsnowlabs.nlp.annotators.common.NerTagged
21	import com.johnsnowlabs.nlp._
22	import org.apache.spark.ml.param.{BooleanParam, StringArrayParam}
23	import org.apache.spark.ml.util.Identifiable
24
25	import scala.collection.immutable.Map
26
27	/** Converts a IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens
28	* of recognized entities and their label. Results in `CHUNK` Annotation type.
29	*
30	* NER chunks can then be filtered by setting a whitelist with `setWhiteList`. Chunks with no
31	* associated entity (tagged "O") are filtered.
32	*
33	* See also
34	* [[https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging) Inside–outside–beginning (tagging)]]
35	* for more information.
36	*
37	* ==Example==
38	* This is a continuation of the example of the
39	* [[com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel NerDLModel]]. See that class on how to
40	* extract the entities.
41	*
42	* The output of the NerDLModel follows the Annotator schema and can be converted like so:
43	* {{{
44	* result.selectExpr("explode(ner)").show(false)
45	* +----------------------------------------------------+
46	* \|col \|
47	* +----------------------------------------------------+
48	* \|[named_entity, 0, 2, B-ORG, [word -> U.N], []] \|
49	* \|[named_entity, 3, 3, O, [word -> .], []] \|
50	* \|[named_entity, 5, 12, O, [word -> official], []] \|
51	* \|[named_entity, 14, 18, B-PER, [word -> Ekeus], []] \|
52	* \|[named_entity, 20, 24, O, [word -> heads], []] \|
53	* \|[named_entity, 26, 28, O, [word -> for], []] \|
54	* \|[named_entity, 30, 36, B-LOC, [word -> Baghdad], []]\|
55	* \|[named_entity, 37, 37, O, [word -> .], []] \|
56	* +----------------------------------------------------+
57	* }}}
58	* After the converter is used:
59	* {{{
60	* val converter = new NerConverter()
61	* .setInputCols("sentence", "token", "ner")
62	* .setOutputCol("entities")
63	* .setPreservePosition(false)
64	*
65	* converter.transform(result).selectExpr("explode(entities)").show(false)
66	* +------------------------------------------------------------------------+
67	* \|col \|
68	* +------------------------------------------------------------------------+
69	* \|[chunk, 0, 2, U.N, [entity -> ORG, sentence -> 0, chunk -> 0], []] \|
70	* \|[chunk, 14, 18, Ekeus, [entity -> PER, sentence -> 0, chunk -> 1], []] \|
71	* \|[chunk, 30, 36, Baghdad, [entity -> LOC, sentence -> 0, chunk -> 2], []]\|
72	* +------------------------------------------------------------------------+
73	* }}}
74	*
75	* @groupname anno Annotator types
76	* @groupdesc anno
77	* Required input and expected output annotator types
78	* @groupname Ungrouped Members
79	* @groupname param Parameters
80	* @groupname setParam Parameter setters
81	* @groupname getParam Parameter getters
82	* @groupname Ungrouped Members
83	* @groupprio param 1
84	* @groupprio anno 2
85	* @groupprio Ungrouped 3
86	* @groupprio setParam 4
87	* @groupprio getParam 5
88	* @groupdesc param
89	* A list of (hyper-)parameter keys this annotator can take. Users can set and get the
90	* parameter values through setters and getters, respectively.
91	*/
92	class NerConverter(override val uid: String)
93	extends AnnotatorModel[NerConverter]
94	with HasSimpleAnnotate[NerConverter] {
95
96	def this() = this(Identifiable.randomUID("NER_CONVERTER"))	×
97
98	/** Input Annotator Type : DOCUMENT, TOKEN, NAMED_ENTITY
99	*
100	* @group anno
101	*/
102	override val inputAnnotatorTypes: Array[String] = Array(DOCUMENT, TOKEN, NAMED_ENTITY)	×
103
104	/** Output Annotator Type : CHUNK
105	*
106	* @group anno
107	*/
108	override val outputAnnotatorType: AnnotatorType = CHUNK	×
109
110	/** If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix
111	* on labels
112	*
113	* @group param
114	*/
115	val whiteList: StringArrayParam = new StringArrayParam(	×
116	this,
117	"whiteList",	×
118	"If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix on labels")	×
119
120	/** If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix
121	* on labels
122	*
123	* @group setParam
124	*/
125	def setWhiteList(list: String*): NerConverter.this.type = set(whiteList, list.toArray)	×
126
127	/** Whether to preserve the original position of the tokens in the original document or use the
128	* modified tokens (Default: `true`)
129	*
130	* @group param
131	*/
132	val preservePosition: BooleanParam = new BooleanParam(	×
133	this,
134	"preservePosition",	×
135	"Whether to preserve the original position of the tokens in the original document or use the modified tokens")	×
136
137	/** Whether to preserve the original position of the tokens in the original document or use the
138	* modified tokens (Default: `true`)
139	*
140	* @group setParam
141	*/
142	def setPreservePosition(value: Boolean): this.type = set(preservePosition, value)	×
143
144	/** set this to true if your NER tags coming from a model that does not have a IOB/IOB2 schema
145	*
146	* @group param
147	*/
148	val nerHasNoSchema: BooleanParam = new BooleanParam(	×
149	this,
150	"nerHasNoSchema",	×
151	"set this to true if your NER tags coming from a model that does not have a IOB/IOB2 schema")	×
152
153	/** @group setParam */
154	def setNerHasNoSchema(value: Boolean): this.type = set(nerHasNoSchema, value)	×
155
156	setDefault(preservePosition -> true, nerHasNoSchema -> false)	×
157
158	override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
159	val sentences = NerTagged.unpack(annotations)	×
160	val docs = annotations.filter(a =>	×
161	a.annotatorType == AnnotatorType.DOCUMENT && sentences.exists(b =>	×
162	b.indexedTaggedWords.exists(c => c.begin >= a.begin && c.end <= a.end)))	×
163
164	val entities = sentences.zip(docs.zipWithIndex).flatMap { case (sentence, doc) =>	×
165	NerTagsEncoding.fromIOB(	×
166	sentence,
167	doc._1,	×
168	sentenceIndex = doc._2,	×
169	originalOffset = $(preservePosition),	×
170	nerHasNoSchema = $(nerHasNoSchema))	×
171	}
172
173	entities
174	.filter(entity => get(whiteList).forall(validEntity => validEntity.contains(entity.entity)))	×
175	.zipWithIndex	×
176	.map { case (entity, idx) =>	×
177	val baseMetadata =
178	Map("entity" -> entity.entity, "sentence" -> entity.sentenceId, "chunk" -> idx.toString)	×
179	val metadata =
180	if (entity.confidence.isEmpty) baseMetadata	×
181	else baseMetadata + ("confidence" -> entity.confidence.get.toString)	×
182	Annotation(outputAnnotatorType, entity.start, entity.end, entity.text, metadata)	×
183
184	}
185	}
186
187	}
188
189	object NerConverter extends ParamsAndFeaturesReadable[NerConverter]

JohnSnowLabs / spark-nlp / 4651025675

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous