• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 7861513225

11 Feb 2024 11:05AM UTC coverage: 62.678% (-0.05%) from 62.731%
7861513225

Pull #14169

github

web-flow
Merge 13f2acde4 into 6010244ba
Pull Request #14169: Fixed a bug with models that have an 'onnx_data' file not working in dbfs/hdfs

8951 of 14281 relevant lines covered (62.68%)

0.63 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.89
/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/wordpiece/WordpieceEncoder.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece
18

19
import com.johnsnowlabs.nlp.annotators.common.{IndexedToken, TokenPiece}
20

21
import scala.collection.mutable.ArrayBuffer
22

23
/** Greedy longest-match-first WordPiece encoder.
  *
  * Splits a token into sub-word pieces found in `vocabulary`. Pieces after the
  * first are looked up with `partPrefix` prepended (e.g. "##ing"). A token that
  * is too long, or that contains a span with no vocabulary match, collapses to
  * a single `unkToken` piece.
  *
  * @param vocabulary
  *   maps word pieces to their integer ids; must contain `unkToken`
  * @param unkToken
  *   piece emitted when a token cannot be encoded
  * @param maxInputCharsPerWord
  *   tokens longer than this are mapped straight to `unkToken`
  * @param partPrefix
  *   prefix prepended when looking up non-initial word pieces
  */
private[johnsnowlabs] class WordpieceEncoder(
    vocabulary: Map[String, Int],
    unkToken: String = "[UNK]",
    maxInputCharsPerWord: Int = 200,
    partPrefix: String = "##") {

  require(vocabulary.contains(unkToken), "token " + unkToken + " not found in vocabulary")

  /** Encodes a single token into WordPiece sub-tokens.
    *
    * Uses greedy longest-prefix matching: at each position it takes the longest
    * substring (prefixed with `partPrefix` for non-initial positions) present in
    * the vocabulary, then continues from the end of that match.
    *
    * @param token
    *   the token to encode, with its character offsets in the original text
    * @return
    *   the word pieces covering the token; a single `unkToken` piece when the
    *   token exceeds `maxInputCharsPerWord` or any position has no match; an
    *   empty array for an empty token
    */
  def encode(token: IndexedToken): Array[TokenPiece] = {
    val unkId = vocabulary(unkToken)
    val text = token.token

    // Whole token collapsed to a single [UNK] piece spanning the token's offsets.
    def unknownPiece: Array[TokenPiece] =
      Array(TokenPiece(unkToken, text, unkId, isWordStart = true, token.begin, token.end))

    if (text.length > maxInputCharsPerWord) unknownPiece
    else {
      val pieces = ArrayBuffer[TokenPiece]()
      var start = 0
      var matchedAll = true

      while (matchedAll && start < text.length) {
        // Greedy search: shrink the candidate from the right until it is in the
        // vocabulary, or until it is empty (no match possible at this position).
        var end = text.length
        var found: Option[(String, Int)] = None
        while (found.isEmpty && end > start) {
          val candidate = (if (start > 0) partPrefix else "") + text.substring(start, end)
          vocabulary.get(candidate) match {
            case Some(id) => found = Some((candidate, id))
            case None => end -= 1
          }
        }

        found match {
          case Some((piece, id)) =>
            pieces.append(
              TokenPiece(
                piece,
                text,
                id,
                start == 0,
                token.begin + start,
                token.begin + end - 1))
            start = end
          case None =>
            // Unencodable span: discard any partial pieces and fall back to [UNK].
            matchedAll = false
        }
      }

      if (matchedAll) pieces.toArray else unknownPiece
    }
  }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc