• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 4951808959

pending completion
4951808959

Pull #13792

github

GitHub
Merge efe6b42df into ef7906c5e
Pull Request #13792: SPARKNLP-825 Adding multilabel param

7 of 7 new or added lines in 1 file covered. (100.0%)

8637 of 13128 relevant lines covered (65.79%)

0.66 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

72.73
/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/NerTagsEncoding.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators.ner
18

19
import com.johnsnowlabs.nlp.Annotation
20
import com.johnsnowlabs.nlp.annotators.common.Annotated.NerTaggedSentence
21

22
import scala.collection.mutable.ArrayBuffer
23

24
/** Decodes NER tag sequences into entity chunks. Supports the IOB and IOB2 tagging schemes:
  * https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
  */
object NerTagsEncoding {

  /** Converts an IOB or IOB2 tagged sentence into a list of NamedEntity chunks.
    *
    * Tokens are scanned left to right. A chunk is opened at the first token whose tag is not "O"
    * (or at any token when `includeNoneEntities` is set). It is closed when a later token is
    * tagged "O", when a later token's tag starts with the chunk-opening prefix ("B-" for IOB2,
    * "I-" for any other `format`), or when the sentence ends.
    *
    * @param sentence
    *   Tagged tokens of one sentence
    * @param doc
    *   Source doc text
    * @param sentenceIndex
    *   Stored, as a string, in the `sentenceId` of every produced entity
    * @param originalOffset
    *   When true the entity text is cut out of `doc.result` using the token offsets; when false
    *   the entity text is the word of the first token of the chunk
    * @param nerHasNoSchema
    *   When true the tag is used verbatim as the entity label; when false the first two
    *   characters of the tag (the "B-"/"I-" prefix) are dropped
    * @param includeNoneEntities
    *   When true, chunks may also start on tokens tagged "O", producing entities labelled "O"
    * @param format
    *   "IOB2" (default) closes the current chunk on "B-" tags; any other value closes it on "I-"
    *   tags
    * @return
    *   Extracted Named Entities
    * @throws IllegalArgumentException
    *   if the offsets of a chunk do not fit inside `doc`
    */
  def fromIOB(
      sentence: NerTaggedSentence,
      doc: Annotation,
      sentenceIndex: Int = 0,
      originalOffset: Boolean = true,
      nerHasNoSchema: Boolean = false,
      includeNoneEntities: Boolean = false,
      format: String = "IOB2"): Seq[NamedEntity] = {

    val noChunk = "O"
    // Prefix that marks the start of a new chunk in the selected tagging scheme.
    val beginningTagChunk = if (format == "IOB2") "B-" else "I-"

    val result = ArrayBuffer[NamedEntity]()

    val words = sentence.words.length

    // Label and first token index of the chunk currently being collected, if any.
    var lastTag: Option[String] = None
    var lastTagStart = -1

    /** Emits the open chunk covering tokens startIdx..endIdx (inclusive) and closes it. Must only
      * be called while `lastTag` is defined.
      */
    def flushEntity(startIdx: Int, endIdx: Int): Unit = {
      val firstToken = sentence.indexedTaggedWords(startIdx)
      val lastToken = sentence.indexedTaggedWords(endIdx)
      // Offsets relative to the beginning of the document annotation.
      val start = firstToken.begin - doc.begin
      val end = lastToken.end - doc.begin

      // `end` is used as an inclusive character index below (substring(start, end + 1)), so
      // cutting the text out of doc.result needs 0 <= start and end < length. Those cases used
      // to slip through this guard and fail later with a bare StringIndexOutOfBoundsException.
      // When the text is not cut out of the document the original, laxer bound is kept.
      val offsetsFitDoc =
        if (originalOffset) start >= 0 && end < doc.result.length
        else end <= doc.result.length
      require(
        start <= end && offsetsFitDoc,
        s"Failed to flush entities in NerConverter. " +
          s"Chunk offsets $start - $end are not within tokens:\n${sentence.words
              .mkString("||")}\nfor sentence:\n${doc.result}")

      // Every metadata value of every token in the chunk is treated as a candidate score.
      val confidenceArray =
        sentence.indexedTaggedWords.slice(startIdx, endIdx + 1).flatMap(_.metadata.values)
      // Best effort: if any value is not a float, the chunk gets no confidence at all.
      val finalConfidenceArray =
        try {
          confidenceArray.map(x => x.trim.toFloat)
        } catch {
          case _: Exception => Array.empty[Float]
        }
      val confidence =
        if (finalConfidenceArray.isEmpty) None
        else Some(finalConfidenceArray.sum / finalConfidenceArray.length)

      val content =
        if (originalOffset) doc.result.substring(start, end + 1)
        else firstToken.word

      val entity = NamedEntity(
        firstToken.begin,
        lastToken.end,
        lastTag.get,
        content,
        sentenceIndex.toString,
        confidence)
      result.append(entity)
      lastTag = None
    }

    /** Entity label for a tag: the tag itself when there is no schema, otherwise the tag without
      * its two-character prefix. Tags too short to carry a prefix yield None, so the token is
      * skipped; this matches the previous behaviour, where the out-of-bounds exception was
      * caught and followed by a check that could never fail.
      */
    def entityLabel(tag: String): Option[String] =
      if (nerHasNoSchema) Some(tag)
      else if (tag.length >= 2) Some(tag.substring(2))
      else None

    for (i <- 0 until words) {
      val tag = sentence.tags(i)

      // A chunk-opening tag or "O" ends the chunk that is currently open.
      if (lastTag.isDefined && (tag.startsWith(beginningTagChunk) || tag == noChunk)) {
        flushEntity(lastTagStart, i - 1)
      }

      // Open a new chunk on this token when none is open. "O" tokens only open a chunk when
      // includeNoneEntities is set, and are labelled "O" as-is.
      if (lastTag.isEmpty && (includeNoneEntities || tag != noChunk)) {
        lastTag = if (tag == noChunk) Some(tag) else entityLabel(tag)
        lastTagStart = i
      }
    }

    // Emit the chunk that is still open at the end of the sentence.
    if (lastTag.isDefined) {
      flushEntity(lastTagStart, words - 1)
    }

    result.toList
  }

}
128

129
/** An entity chunk extracted from a tagged sentence.
  *
  * The field descriptions below reflect how NerTagsEncoding.fromIOB populates them.
  *
  * @param start
  *   `begin` offset of the first token of the chunk
  * @param end
  *   `end` offset of the last token of the chunk
  * @param entity
  *   Entity label of the chunk
  * @param text
  *   Text of the chunk
  * @param sentenceId
  *   Index of the sentence the chunk belongs to, as a string
  * @param confidence
  *   Mean of the numeric metadata values of the chunk's tokens, or None when there are none or
  *   they cannot be parsed as floats
  */
case class NamedEntity(
    start: Int,
    end: Int,
    entity: String,
    text: String,
    sentenceId: String,
    confidence: Option[Float])
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc