• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 7861513225

11 Feb 2024 11:05AM UTC coverage: 62.678% (-0.05%) from 62.731%
7861513225

Pull #14169

github

web-flow
Merge 13f2acde4 into 6010244ba
Pull Request #14169: Fixed a bug with models that have an 'onnx_data' file not working in dbfs/hdfs

8951 of 14281 relevant lines covered (62.68%)

0.63 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.89
/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/wordpiece/WordpieceEncoder.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece
18

19
import com.johnsnowlabs.nlp.annotators.common.{IndexedToken, TokenPiece}
20

21
import scala.collection.mutable.ArrayBuffer
22

23
/** Greedy longest-match-first WordPiece encoder.
  *
  * Splits a token into sub-word pieces found in `vocabulary`. Pieces after the
  * first are looked up with `partPrefix` prepended (e.g. "##ing"). A token that
  * is too long, or that contains a span with no vocabulary match, collapses to
  * a single `unkToken` piece.
  *
  * @param vocabulary
  *   maps word pieces to their integer ids; must contain `unkToken`
  * @param unkToken
  *   piece emitted when a token cannot be encoded
  * @param maxInputCharsPerWord
  *   tokens longer than this are mapped straight to `unkToken`
  * @param partPrefix
  *   prefix prepended when looking up non-initial word pieces
  */
private[johnsnowlabs] class WordpieceEncoder(
    vocabulary: Map[String, Int],
    unkToken: String = "[UNK]",
    maxInputCharsPerWord: Int = 200,
    partPrefix: String = "##") {

  require(vocabulary.contains(unkToken), "token " + unkToken + " not found in vocabulary")

  /** Encodes a single token into WordPiece sub-tokens.
    *
    * Uses greedy longest-prefix matching: at each position it takes the longest
    * substring (prefixed with `partPrefix` for non-initial positions) present in
    * the vocabulary, then continues from the end of that match.
    *
    * @param token
    *   the token to encode, with its character offsets in the original text
    * @return
    *   the word pieces covering the token; a single `unkToken` piece when the
    *   token exceeds `maxInputCharsPerWord` or any position has no match; an
    *   empty array for an empty token
    */
  def encode(token: IndexedToken): Array[TokenPiece] = {
    val unkId = vocabulary(unkToken)
    val text = token.token

    // Whole token collapsed to a single [UNK] piece spanning the token's offsets.
    def unknownPiece: Array[TokenPiece] =
      Array(TokenPiece(unkToken, text, unkId, isWordStart = true, token.begin, token.end))

    if (text.length > maxInputCharsPerWord) unknownPiece
    else {
      val pieces = ArrayBuffer[TokenPiece]()
      var start = 0
      var matchedAll = true

      while (matchedAll && start < text.length) {
        // Greedy search: shrink the candidate from the right until it is in the
        // vocabulary, or until it is empty (no match possible at this position).
        var end = text.length
        var found: Option[(String, Int)] = None
        while (found.isEmpty && end > start) {
          val candidate = (if (start > 0) partPrefix else "") + text.substring(start, end)
          vocabulary.get(candidate) match {
            case Some(id) => found = Some((candidate, id))
            case None => end -= 1
          }
        }

        found match {
          case Some((piece, id)) =>
            pieces.append(
              TokenPiece(
                piece,
                text,
                id,
                start == 0,
                token.begin + start,
                token.begin + end - 1))
            start = end
          case None =>
            // Unencodable span: discard any partial pieces and fall back to [UNK].
            matchedAll = false
        }
      }

      if (matchedAll) pieces.toArray else unknownPiece
    }
  }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc