• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 13883000244

16 Mar 2025 11:44AM CUT coverage: 59.034% (-1.0%) from 60.072%
13883000244

Pull #14444

github

web-flow
Merge 6d717703b into 05000ab4a
Pull Request #14444: Sparknlp 1060 implement phi 3.5 vision

0 of 292 new or added lines in 5 files covered. (0.0%)

20 existing lines in 14 files now uncovered.

9413 of 15945 relevant lines covered (59.03%)

0.59 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.85
/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/FeatureGenerator.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators.ner.crf
18

19
import com.johnsnowlabs.ml.crf._
20
import com.johnsnowlabs.nlp.annotators.common.{TaggedSentence, WordpieceEmbeddingsSentence}
21

22
import scala.collection.mutable
23

24
/** Generates per-token CRF input features (shape, character type, affixes,
  * dictionary hits) for CrfBasedNer.
  */
case class FeatureGenerator(dictFeatures: DictionaryFeatures) {

  // Canonical representative for each punctuation family, used when building
  // token "shape" strings: every character in a family maps to one symbol.
  val shapeEncoding =
    Seq(
      ".," -> '.',
      ":;?!" -> ':',
      "-+*/=|_%" -> '-',
      "({[<" -> '(',
      ")}]>" -> ')')
      .flatMap { case (family, canonical) => family.map(_ -> canonical) }
      .toMap
51
  def getShape(token: String) = {
52
    token.map(c => {
1✔
53
      if (c.isLower)
1✔
54
        'L'
1✔
55
      else if (c.isUpper)
1✔
56
        'U'
1✔
57
      else if (c.isDigit)
1✔
58
        'D'
×
59
      shapeEncoding.getOrElse(c, c)
1✔
60
    })
61
  }
62

63
  def shrink(str: String) = {
64
    val builder = new StringBuilder()
1✔
65
    for (c <- str) {
1✔
UNCOV
66
      if (builder.length == 0 || builder.last != c)
×
67
        builder.append(c)
1✔
68
    }
69
    builder.toString
1✔
70
  }
71

  /** Bit-flag classes describing a token's overall character make-up
    * (all-uppercase, all-digit, starts-uppercase, ...). Each value's id is a
    * distinct power of two so getType can track every still-possible class
    * in a single Int mask and clear candidates character by character.
    *
    * NOTE(review): the explicit ids take part in the feature encoding
    * (f("t") = getType(token).toString); do not renumber.
    */
  object TokenType extends Enumeration {
    type TokenType = Value
    val AllUpper = Value(1 << 0)
    val AllDigit = Value(1 << 1)
    val AllSymbol = Value(1 << 2)
    val AllUpperDigit = Value(1 << 3)
    val AllUpperSymbol = Value(1 << 4)
    val AllDigitSymbol = Value(1 << 5)
    val AllUpperDigitSymbol = Value(1 << 6)
    val StartsUpper = Value(1 << 7)
    val AllLetter = Value(1 << 8)
    val AllAlnum = Value(1 << 9)

    // Mask with every flag above set: max id is 1 << 9, so 2 * max - 1
    // yields ten consecutive one-bits.
    val allTypes = values.max.id * 2 - 1
  }
  // Delimiters that may legitimately appear inside a number (e.g. "1,000.5");
  // getType treats them as digit characters so such tokens stay AllDigit.
  val digitDelims = Seq(',', '.')
90
  def getType(token: String): Int = {
91
    var types = TokenType.allTypes
1✔
92

93
    def remove(t: TokenType.TokenType) = {
94
      types = types & (~t.id)
1✔
95
    }
96

97
    var isFirst = true
1✔
98
    for (c <- token) {
1✔
99
      if (c.isUpper) {
1✔
100
        remove(TokenType.AllDigit)
1✔
101
        remove(TokenType.AllSymbol)
1✔
102
        remove(TokenType.AllDigitSymbol)
1✔
103
      } else if (c.isDigit || digitDelims.contains(c)) {
1✔
104
        remove(TokenType.AllUpper)
1✔
105
        remove(TokenType.AllSymbol)
1✔
106
        remove(TokenType.AllUpperSymbol)
1✔
107
        remove(TokenType.AllLetter)
1✔
108
      } else if (c.isLower) {
1✔
109
        remove(TokenType.AllUpper)
1✔
110
        remove(TokenType.AllDigit)
1✔
111
        remove(TokenType.AllSymbol)
1✔
112
        remove(TokenType.AllUpperDigit)
1✔
113
        remove(TokenType.AllUpperSymbol)
1✔
114
        remove(TokenType.AllDigitSymbol)
1✔
115
        remove(TokenType.AllUpperDigitSymbol)
1✔
116
      } else {
×
117
        remove(TokenType.AllUpper)
×
118
        remove(TokenType.AllDigit)
×
119
        remove(TokenType.AllUpperDigit)
×
120
        remove(TokenType.AllLetter)
×
121
        remove(TokenType.AllAlnum)
×
122
      }
123

124
      if (isFirst && !c.isUpper)
1✔
125
        remove(TokenType.StartsUpper)
1✔
126

127
      isFirst = false
1✔
128
    }
129

130
    val result = TokenType.values
131
      .filter(value => (value.id & types) > 0)
1✔
132
      .map(value => value.id)
1✔
133
      .headOption
1✔
134

135
    result.getOrElse(0)
1✔
136
  }
137

138
  def isDigitOrPredicate(token: String, predicate: Function[Char, Boolean]) = {
139
    var hasDigits = false
1✔
140
    var hasPredicate = false
1✔
141
    var hasOther = false
1✔
142
    for (c <- token) {
1✔
143
      hasDigits = hasDigits || c.isDigit
1✔
144
      hasPredicate = hasPredicate || predicate(c)
1✔
145
      hasOther = hasOther || !c.isLetterOrDigit
1✔
146
    }
147
    !hasOther && hasDigits && hasPredicate
1✔
148
  }
149

150
  def isAllSymbols(token: String) = {
151
    !token.forall(c => c.isLetterOrDigit)
×
152
  }
153

154
  def isShort(token: String) = {
155
    token.length == 2 && token(0).isUpper && token(1) == '.'
×
156
  }
157

158
  def containsUpper(token: String) = token.exists(c => c.isUpper)
1✔
159

160
  def containsLower(token: String) = token.exists(c => c.isLower)
1✔
161

162
  def containsLetter(token: String) = token.exists(c => c.isLetter)
1✔
163

164
  def containsDigit(token: String) = token.exists(c => c.isDigit)
1✔
165

166
  def containsSymbol(token: String) = token.exists(c => c.isLetterOrDigit)
1✔
167

168
  def getSuffix(token: String, size: Int, default: String = "") = {
169
    if (token.length >= size)
1✔
170
      token.substring(token.length - size).toLowerCase
1✔
171
    else
172
      default
1✔
173
  }
174

175
  def getPrefix(token: String, size: Int, default: String = "") = {
176
    if (token.length >= size)
1✔
177
      token.substring(0, size).toLowerCase
1✔
178
    else
179
      default
1✔
180
  }
181

182
  def fillFeatures(token: String): mutable.Map[String, String] = {
183
    val f = mutable.Map[String, String]()
1✔
184
    f("w") = token
1✔
185
    f("wl") = token.toLowerCase
1✔
186

187
    f("s") = getShape(token)
1✔
188
    f("h") = shrink(f("s"))
1✔
189
    f("t") = getType(token).toString
1✔
190

191
    f("p1") = getPrefix(token, 1)
1✔
192
    f("p2") = getPrefix(token, 2)
1✔
193
    f("p3") = getPrefix(token, 3)
1✔
194
    f("p4") = getPrefix(token, 4)
1✔
195

196
    f("s1") = getSuffix(token, 1)
1✔
197
    f("s2") = getSuffix(token, 2)
1✔
198
    f("s3") = getSuffix(token, 3)
1✔
199
    f("s4") = getSuffix(token, 4)
1✔
200

201
    f("dl") = isDigitOrPredicate(token, c => c.isLetter).toString
1✔
202
    f("d-") = isDigitOrPredicate(token, c => c == '-').toString
1✔
203
    f("d/") = isDigitOrPredicate(token, c => c == '/').toString
1✔
204
    f("d,") = isDigitOrPredicate(token, c => c == ',').toString
1✔
205
    f("d.") = isDigitOrPredicate(token, c => c == '.').toString
1✔
206

207
    f("u.") = isShort(token).toString
1✔
208
    f("iu") = (token.nonEmpty && token(0).isUpper).toString
1✔
209

210
    f("cu") = containsUpper(token).toString
1✔
211
    f("cl") = containsLower(token).toString
1✔
212
    f("ca") = containsLetter(token).toString
1✔
213
    f("cd") = containsDigit(token).toString
1✔
214
    f("cs") = containsSymbol(token).toString
1✔
215

216
    f
217
  }
218

  // Feature families emitted as adjacent-position pairs in generate().
  val pairs = Array("w", "pos", "h", "t")
  // Context half-window: features are drawn from positions [-2, 2] around
  // the current token.
  val window = 2

222
  def isInRange(idx: Int, size: Int) = idx >= 0 && idx < size
1✔
223

224
  def getName(source: String, idx: Int): String = {
225
    source + "~" + idx
1✔
226
  }
227

228
  def getName(source: String, idx1: Int, idx2: Int): String = {
229
    getName(source, idx1) + "|" + getName(source, idx2)
1✔
230
  }
231

  /** Builds the attributed representation of one sentence for the CRF:
    * for every token, pairwise features over adjacent positions, unary
    * features over a [-window, window] context, dictionary features, and
    * BOS/EOS markers, plus the token's word-piece embedding vector.
    *
    * Precondition (asserted below): `wordpieceEmbeddingsSentence` must carry
    * exactly one word-start token per word of `taggedSentence`.
    */
  def generate(
      taggedSentence: TaggedSentence,
      wordpieceEmbeddingsSentence: WordpieceEmbeddingsSentence): TextSentenceAttrs = {

    // Per-word feature maps, with the POS tag added under key "pos".
    val wordFeatures = taggedSentence.words
      .zip(taggedSentence.tags)
      .map { case (word, tag) =>
        val f = fillFeatures(word)
        f("pos") = tag
        f
      }

    val words = wordFeatures.length

    // wordsList is consumed one word per iteration below so that
    // dictFeatures.get always sees the remaining suffix of the sentence.
    var wordsList = taggedSentence.words.toList
    // Keep only word-start pieces so embeddings align 1:1 with words.
    val embeddings = wordpieceEmbeddingsSentence.tokens
      .filter(t => t.isWordStart)
      .map(t => t.embeddings)

    assert(
      embeddings.length == wordsList.length,
      "Mismatched embedding tokens and sentence tokens. Make sure you are properly " +
        "linking tokens and embeddings to the same inputCol DOCUMENT annotator")

    val attrs = (0 until words).map { i =>
      // Adjacent-pair features: for offsets j in [-window, window), join the
      // values of each feature family at positions i+j and i+j+1.
      val pairAttrs = (-window until window)
        .filter(j => isInRange(i + j, words) && isInRange(i + j + 1, words))
        .flatMap(j =>
          pairs.map { name =>
            val feature = getName(name, j, j + 1)
            val value1 = wordFeatures(i + j).getOrElse(name, "")
            val value2 = wordFeatures(i + j + 1).getOrElse(name, "")
            (feature, value1 + "|" + value2)
          })
        .toArray

      // Unary features: every feature of every word within the window,
      // renamed with its relative offset.
      val unoAttrs = (-window to window)
        .filter(j => isInRange(i + j, words))
        .flatMap { j =>
          wordFeatures(i + j).map { case (name, value) =>
            (getName(name, j), value)
          }
        }
        .toArray

      // Dictionary features for the suffix starting at word i; then advance.
      val dictAttrs = dictFeatures.get(wordsList).map((getName("dt", i), _))
      wordsList = wordsList.tail

      // Sentence-boundary markers. NOTE(review): for a one-word sentence
      // only _BOS_ is emitted (the first branch wins) — presumably intended;
      // confirm before changing.
      val addition =
        if (i == 0) Array(("_BOS_", ""))
        else if (i == words - 1) Array(("_EOS_", ""))
        else Array.empty[(String, String)]

      val binAttrs = pairAttrs ++ unoAttrs ++ dictAttrs ++ addition

      val numAttrs = embeddings(i)

      WordAttrs(binAttrs, numAttrs)
    }

    TextSentenceAttrs(attrs)
  }

295
  def generateDataset(sentences: TraversableOnce[
296
    (TextSentenceLabels, TaggedSentence, WordpieceEmbeddingsSentence)]): CrfDataset = {
297
    val textDataset = sentences
298
      .filter(p => p._2.words.length > 0)
1✔
299
      .map { case (labels, sentence, withEmbeddings) =>
1✔
300
        val textSentence = generate(sentence, withEmbeddings)
1✔
301
        (labels, textSentence)
1✔
302
      }
303

304
    DatasetReader.encodeDataset(textDataset)
1✔
305
  }
306

307
  def generate(
308
      sentence: TaggedSentence,
309
      withEmbeddings: WordpieceEmbeddingsSentence,
310
      metadata: DatasetMetadata): Instance = {
311
    val attrSentence = generate(sentence, withEmbeddings)
1✔
312

313
    DatasetReader.encodeSentence(attrSentence, metadata)
1✔
314
  }
315
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc