• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 4951808959

pending completion
4951808959

Pull #13792

github

GitHub
Merge efe6b42df into ef7906c5e
Pull Request #13792: SPARKNLP-825 Adding multilabel param

7 of 7 new or added lines in 1 file covered. (100.0%)

8637 of 13128 relevant lines covered (65.79%)

0.66 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.7
/src/main/scala/com/johnsnowlabs/nlp/annotators/er/AhoCorasickAutomaton.scala
/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
package com.johnsnowlabs.nlp.annotators.er
17

18
import com.johnsnowlabs.nlp.Annotation
19
import com.johnsnowlabs.nlp.AnnotatorType.CHUNK
20
import com.johnsnowlabs.nlp.annotators.common.Sentence
21

22
import scala.collection.mutable.ArrayBuffer
23

24
/** Aho-Corasick Algorithm: https://dl.acm.org/doi/10.1145/360825.360855 A simple, efficient
  * algorithm to locate all occurrences of any of a finite number of keywords in a string of text.
  * The algorithm consists of constructing a finite state pattern matching machine from the
  * keywords and then using the pattern matching machine to process the text string in a single
  * pass. The complexity of constructing a pattern matching machine and searching the text is
  * linear to the total length of given patterns and the length of a text, respectively.
  *
  * @param alphabet
  *   All characters that may appear in the processed text; a space is appended automatically if
  *   missing so multi-word keywords can be matched.
  * @param patterns
  *   Entity patterns (label, keywords, optional id) to compile into the automaton.
  * @param caseSensitive
  *   When false (default), keywords and input characters are lower-cased before matching.
  */
class AhoCorasickAutomaton(
    var alphabet: String,
    patterns: Array[EntityPattern],
    caseSensitive: Boolean = false)
    extends Serializable {

  // A space must be part of the alphabet so that multi-word keywords can be matched.
  alphabet = if (alphabet.contains(" ")) alphabet else alphabet + " "

  // Flatten every entity pattern into one entry per keyword; keywords are lower-cased
  // once here when matching is case-insensitive, so the search loop never re-normalizes.
  private val flattenEntityPatterns: Array[FlattenEntityPattern] = patterns.flatMap {
    entityPattern =>
      entityPattern.patterns.map { pattern =>
        val keyword = if (caseSensitive) pattern else pattern.toLowerCase
        FlattenEntityPattern(entityPattern.label, keyword, entityPattern.id)
      }
  }

  private val ALPHABET_SIZE = alphabet.length
  // Upper bound on the number of trie states: one per keyword character plus the root.
  private val MAX_NODES = flattenEntityPatterns.map(_.keyword.length).sum + 1

  var nodes: Array[Option[Node]] = Array.fill(MAX_NODES)(None)
  var nodeCount: Int = 1 // slot 0 is reserved for the root state

  /** One state of the automaton. `children` holds the raw trie edges, while `transitions`
    * caches the goto function (trie edge or failure-link fallback) and is filled lazily by
    * `findNextState`. `-1` marks an absent edge / not-yet-computed entry.
    */
  class Node extends Serializable {

    var parentState: Int = -1
    var charFromParent: Option[Char] = None
    var suffixLink: Int = -1 // computed on demand by findSuffixLink
    var children: Array[Int] = Array.fill(ALPHABET_SIZE)(-1)
    var transitions: Array[Int] =
      Array.fill(ALPHABET_SIZE)(-1) // Transition Table aka Goto function
    var isLeaf: Boolean = false // true when a keyword ends at this state
    var entity: String = "" // entity label of the keyword ending here (leaf states only)
    var id: String = "" // optional pattern id of the keyword ending here (leaf states only)
  }

  // Column index of each alphabet character in children/transitions.
  // zipWithIndex already yields (char, index) pairs, so no extra mapping step is needed.
  private val edges: Map[Char, Int] = alphabet.toCharArray.zipWithIndex.toMap

  initializeTrie()

  /** Creates the root state and inserts every flattened keyword into the trie. */
  private def initializeTrie(): Unit = {
    nodes(0) = Some(new Node())
    nodes(0).get.suffixLink = 0
    nodes(0).get.parentState = -1

    flattenEntityPatterns.foreach(pattern => addPattern(pattern))
  }

  /** First step of Aho-Corasick algorithm: Build a Finite State Automaton as a keyword trie in
    * which the nodes represent the state and the edges between nodes are labeled by characters
    * that cause the transitions between nodes. The trie is an efficient implementation of a
    * dictionary of strings.
    *
    * @throws UnsupportedOperationException
    *   if a keyword contains a character that is not part of the alphabet.
    */
  private def addPattern(pattern: FlattenEntityPattern): Unit = {
    var state = 0
    pattern.keyword.toCharArray.foreach { char =>
      val edgeIndex = edges.getOrElse(
        char, {
          val errorMessage = getAlphabetErrorMessage(char)
          throw new UnsupportedOperationException(errorMessage)
        })

      // Create a fresh state when no trie edge exists yet for this character.
      if (nodes(state).get.children(edgeIndex) == -1) {
        nodes(nodeCount) = Some(new Node())
        nodes(nodeCount).get.parentState = state
        nodes(nodeCount).get.charFromParent = Some(char)
        nodes(state).get.children(edgeIndex) = nodeCount
        nodeCount = nodeCount + 1
      }
      state = nodes(state).get.children(edgeIndex)
    }
    // The final state marks the end of a keyword and carries its entity metadata.
    nodes(state).get.isLeaf = true
    nodes(state).get.entity = pattern.entity
    if (pattern.id.isDefined) nodes(state).get.id = pattern.id.get
  }

  /** Second step of Aho-Corasick algorithm: The algorithm starts at the input text’s beginning
    * and in the root state during searching for patterns. It processes the input string in a
    * single pass, and all occurrences of keywords are found, even if they overlap each other.
    *
    * @param sentence
    *   sentence to scan; character offsets in the produced annotations are shifted by
    *   `sentence.start`.
    * @return
    *   one CHUNK annotation per matched keyword.
    */
  def searchPatternsInText(sentence: Sentence): Seq[Annotation] = {
    var previousState = 0
    val chunk: ArrayBuffer[(Char, Int)] = ArrayBuffer.empty
    val chunkAnnotations: ArrayBuffer[Annotation] = ArrayBuffer.empty

    sentence.content.zipWithIndex.foreach { case (char, index) =>
      val currentChar = if (caseSensitive) char else char.toLower
      val state = findNextState(previousState, currentChar)

      if (state > 0) {
        chunk.append((char, index))
      }

      // Falling back to the root closes the current candidate chunk; it is emitted only
      // when the previous state actually completed a keyword, otherwise discarded.
      if (state == 0 && previousState > 0) {
        val node = nodes(previousState).get
        if (node.isLeaf) {
          val chunkAnnotation = buildAnnotation(chunk, node.entity, node.id, sentence)
          chunkAnnotations.append(chunkAnnotation)
          chunk.clear()
        } else chunk.clear()
      }

      previousState = state
    }

    // Flush a chunk that runs up to the end of the sentence. Fix: this previously emitted an
    // annotation without checking isLeaf, so a trailing PARTIAL match produced a spurious
    // chunk with an empty entity; it now applies the same leaf check as the in-loop emit.
    if (chunk.nonEmpty) {
      val node = nodes(previousState).get
      if (node.isLeaf) {
        val chunkAnnotation = buildAnnotation(chunk, node.entity, node.id, sentence)
        chunkAnnotations.append(chunkAnnotation)
      }
      chunk.clear()
    }

    chunkAnnotations
  }

  /** Goto function: returns the next state for `char`, computing and caching the transition
    * (trie edge or failure-link fallback) on first use.
    *
    * @throws UnsupportedOperationException
    *   if `char` is not part of the alphabet.
    */
  private def findNextState(state: Int, char: Char): Int = {

    // Line breaks reset the automaton to the root instead of raising an alphabet error.
    // Fix: the previous check compared the Char against System.getProperty("line.separator"),
    // which on Windows is the two-character "\r\n" and can never equal a single Char, so
    // newline characters crashed the search there (and '\r' crashed everywhere). Checking
    // '\n' and '\r' directly is platform-independent and avoids a property lookup per char.
    if (char == '\n' || char == '\r') return 0

    val edgeIndex: Int = edges.getOrElse(char, -1)
    if (edgeIndex == -1) {
      val errorMessage = getAlphabetErrorMessage(char)
      throw new UnsupportedOperationException(errorMessage)
    }

    val node = nodes(state)
    if (node.get.transitions(edgeIndex) == -1) {
      buildFailureLink(node.get, state, edgeIndex, char)
    }
    node.get.transitions(edgeIndex)
  }

  /** Resolves and caches the transition for `edgeIndex`: follow the trie edge when it exists,
    * otherwise fall back through the suffix link (the root loops to itself).
    */
  private def buildFailureLink(node: Node, state: Int, edgeIndex: Int, char: Char): Unit = {
    if (node.children(edgeIndex) != -1) {
      node.transitions(edgeIndex) = node.children(edgeIndex)
    } else {
      node.transitions(edgeIndex) =
        if (state == 0) 0 else findNextState(findSuffixLink(state), char)
    }
  }

  /** Lazily computes the failure (suffix) link of a state: the state reached by matching the
    * longest proper suffix of this state's keyword prefix. Children of the root link to the
    * root itself.
    */
  private def findSuffixLink(state: Int): Int = {
    val node = nodes(state)
    if (node.get.suffixLink == -1) {
      node.get.suffixLink =
        if (node.get.parentState == 0) 0
        else findNextState(findSuffixLink(node.get.parentState), node.get.charFromParent.get)
    }
    node.get.suffixLink
  }

  /** Builds a CHUNK annotation from the matched characters, shifting their sentence-relative
    * offsets by `sentence.start`. The optional pattern id is added to the metadata only when
    * present.
    */
  def buildAnnotation(
      chunk: ArrayBuffer[(Char, Int)],
      entity: String,
      id: String,
      sentence: Sentence): Annotation = {
    val begin = chunk.head._2 + sentence.start
    val end = chunk.last._2 + sentence.start
    val result = chunk.map(_._1).mkString("")
    val metadata = Map("entity" -> entity, "sentence" -> sentence.index.toString)

    if (id.isEmpty) {
      Annotation(CHUNK, begin, end, result, metadata)
    } else {
      // Single-entry addition: `+` avoids allocating a throwaway Map as `++ Map(...)` did.
      Annotation(CHUNK, begin, end, result, metadata + ("id" -> id))
    }

  }

  /** Error message raised when a character outside the configured alphabet is encountered,
    * pointing users at the custom-alphabet example notebook.
    */
  private def getAlphabetErrorMessage(char: Char): String = {
    val workshopURL = "https://github.com/JohnSnowLabs/spark-nlp/"
    val alphabetExample =
      "blob/master/examples/python/annotation/text/english/entity-ruler/EntityRuler_Alphabet.ipynb"
    val errorMessage: String =
      s"""Char $char not found in the alphabet. Your data could have unusual characters not found
         |in your document's language, which requires setting up a custom alphabet.
         |
         |Please set alphabet using setAlphabetResource parameter and make sure it has all
         |characters that can be found in your documents.
         |
         |You can check an example in Spark NLP Examples: $workshopURL$alphabetExample""".stripMargin

    errorMessage
  }

}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc